]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to v12.1.3
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
86#include "Dir.h"
87#include "ClientSnapRealm.h"
88#include "Fh.h"
89#include "MetaSession.h"
90#include "MetaRequest.h"
91#include "ObjecterWriteback.h"
92#include "posix_acl.h"
93
94#include "include/assert.h"
95#include "include/stat.h"
96
97#include "include/cephfs/ceph_statx.h"
98
99#if HAVE_GETGROUPLIST
100#include <grp.h>
101#include <pwd.h>
102#include <unistd.h>
103#endif
104
105#undef dout_prefix
106#define dout_prefix *_dout << "client." << whoami << " "
107
108#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
109
110// FreeBSD fails to define this
111#ifndef O_DSYNC
112#define O_DSYNC 0x0
113#endif
114// Darwin fails to define this
115#ifndef O_RSYNC
116#define O_RSYNC 0x0
117#endif
118
119#ifndef O_DIRECT
120#define O_DIRECT 0x0
121#endif
122
123#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
124
125void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
126{
127 Client *client = static_cast<Client*>(p);
128 client->flush_set_callback(oset);
129}
130
131
132// -------------
133
134Client::CommandHook::CommandHook(Client *client) :
135 m_client(client)
136{
137}
138
139bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
140 std::string format, bufferlist& out)
141{
142 Formatter *f = Formatter::create(format);
143 f->open_object_section("result");
144 m_client->client_lock.Lock();
145 if (command == "mds_requests")
146 m_client->dump_mds_requests(f);
147 else if (command == "mds_sessions")
148 m_client->dump_mds_sessions(f);
149 else if (command == "dump_cache")
150 m_client->dump_cache(f);
151 else if (command == "kick_stale_sessions")
152 m_client->_kick_stale_sessions();
153 else if (command == "status")
154 m_client->dump_status(f);
155 else
156 assert(0 == "bad command registered");
157 m_client->client_lock.Unlock();
158 f->close_section();
159 f->flush(out);
160 delete f;
161 return true;
162}
163
164
165// -------------
166
// Start a directory read at the beginning: next_offset begins at 2
// (presumably reserving 0/1 for "." and ".." — matches the
// readdir_offset == 2 convention asserted in insert_readdir_results).
// The perms used to open the directory are captured for later readdir
// operations on this handle.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
172
173void Client::_reset_faked_inos()
174{
175 ino_t start = 1024;
176 free_faked_inos.clear();
177 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
178 last_used_faked_ino = 0;
179 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
180}
181
// Hand out the next free fake inode number to 'in', scanning upward
// from the last number assigned and wrapping back to the bottom of the
// free set when we fall off the top.
void Client::_assign_faked_ino(Inode *in)
{
  // first free interval at or after the last assigned number
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran past the top of the number space: wrap and retry from 0
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the free set must never be exhausted
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // the candidate number falls in a used gap; jump to the start of
    // this free interval instead
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the free interval; take the next number
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // mark the number used and remember the real (ino, snapid) it maps to
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
201
202void Client::_release_faked_ino(Inode *in)
203{
204 free_faked_inos.insert(in->faked_ino);
205 faked_ino_map.erase(in->faked_ino);
206}
207
// Translate a fake inode number back to the real (ino, snapid) pair.
// ino 1 always resolves to the root; unknown numbers map to
// (0, CEPH_NOSNAP).  Unlocked variant — see map_faked_ino().
vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();  // NOTE(review): assumes root is non-null here — verify callers
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
  return vino;
}
220
221vinodeno_t Client::map_faked_ino(ino_t ino)
222{
223 Mutex::Locker lock(client_lock);
224 return _map_faked_ino(ino);
225}
226
227// cons/des
228
// Construct an unmounted client: wire up the messenger/monitor/objecter
// trio, size the caches from configuration, and start the objecter
// finisher thread.  No MDS sessions exist yet; mount() comes later.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    require_remount(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-1), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock")
{
  _reset_faked_inos();
  // root inode is attached later, when the first trace/mount reply
  // arrives (see add_update_inode)
  root = 0;

  num_flushing_caps = 0;

  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_max(cct->_conf->client_cache_size);
  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: writeback + object cache share client_lock
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // ask the objecter to tell us when this client gets blacklisted
  objecter->enable_blacklist_events();
}
298
299
Client::~Client()
{
  // destructor must be called with the lock free; we take it ourselves
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311
// Forcibly release everything the metadata cache holds: open file
// handles, open directories, the dentry LRU, and finally the root
// inode itself.  Called from the destructor with client_lock held.
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // _closedir removes the dir from opened_dirs, so loop until empty
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru: a max of 0 makes trim_cache() evict every dentry
  lru.lru_set_max(0);
  trim_cache();
  assert(lru.lru_get_size() == 0);

  // close root ino: only root and its recorded parents may remain
  assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  assert(inode_map.empty());
}
352
353inodeno_t Client::get_root_ino()
354{
355 Mutex::Locker l(client_lock);
356 if (use_faked_inos())
357 return root->faked_ino;
358 else
359 return root->ino;
360}
361
362Inode *Client::get_root()
363{
364 Mutex::Locker l(client_lock);
365 root->ll_get();
366 return root;
367}
368
369
370// debug crapola
371
372void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
373{
374 filepath path;
375 in->make_long_path(path);
376 ldout(cct, 1) << "dump_inode: "
377 << (disconnected ? "DISCONNECTED ":"")
378 << "inode " << in->ino
379 << " " << path
380 << " ref " << in->get_num_ref()
381 << *in << dendl;
382
383 if (f) {
384 f->open_object_section("inode");
385 f->dump_stream("path") << path;
386 if (disconnected)
387 f->dump_int("disconnected", 1);
388 in->dump(f);
389 f->close_section();
390 }
391
392 did.insert(in);
393 if (in->dir) {
394 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
395 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
396 it != in->dir->dentries.end();
397 ++it) {
398 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
399 if (f) {
400 f->open_object_section("dentry");
401 it->second->dump(f);
402 f->close_section();
403 }
404 if (it->second->inode)
405 dump_inode(f, it->second->inode.get(), did, false);
406 }
407 }
408}
409
410void Client::dump_cache(Formatter *f)
411{
412 set<Inode*> did;
413
414 ldout(cct, 1) << "dump_cache" << dendl;
415
416 if (f)
417 f->open_array_section("cache");
418
419 if (root)
420 dump_inode(f, root, did, true);
421
422 // make a second pass to catch anything disconnected
423 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
424 it != inode_map.end();
425 ++it) {
426 if (did.count(it->second))
427 continue;
428 dump_inode(f, it->second, did, true);
429 }
430
431 if (f)
432 f->close_section();
433}
434
// Emit a summary of client state for the admin-socket "status"
// command.  Caller must already hold client_lock.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  // snapshot the osdmap epoch we are currently operating against
  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
459
// Bring the client to a usable (but not yet mounted) state: start the
// timer and object-cacher threads, then hook into the messenger so
// dispatch can begin.  Always returns 0.
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  // once added, incoming messages may be dispatched to us at any time
  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
474
475void Client::_finish_init()
476{
477 client_lock.Lock();
478 // logger
479 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
480 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
481 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
482 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
483 logger.reset(plb.create_perf_counters());
484 cct->get_perfcounters_collection()->add(logger.get());
485
486 client_lock.Unlock();
487
488 cct->_conf->add_observer(this);
489
490 AdminSocket* admin_socket = cct->get_admin_socket();
491 int ret = admin_socket->register_command("mds_requests",
492 "mds_requests",
493 &m_command_hook,
494 "show in-progress mds requests");
495 if (ret < 0) {
496 lderr(cct) << "error registering admin socket command: "
497 << cpp_strerror(-ret) << dendl;
498 }
499 ret = admin_socket->register_command("mds_sessions",
500 "mds_sessions",
501 &m_command_hook,
502 "show mds session state");
503 if (ret < 0) {
504 lderr(cct) << "error registering admin socket command: "
505 << cpp_strerror(-ret) << dendl;
506 }
507 ret = admin_socket->register_command("dump_cache",
508 "dump_cache",
509 &m_command_hook,
510 "show in-memory metadata cache contents");
511 if (ret < 0) {
512 lderr(cct) << "error registering admin socket command: "
513 << cpp_strerror(-ret) << dendl;
514 }
515 ret = admin_socket->register_command("kick_stale_sessions",
516 "kick_stale_sessions",
517 &m_command_hook,
518 "kick sessions that were remote reset");
519 if (ret < 0) {
520 lderr(cct) << "error registering admin socket command: "
521 << cpp_strerror(-ret) << dendl;
522 }
523 ret = admin_socket->register_command("status",
524 "status",
525 &m_command_hook,
526 "show overall client status");
527 if (ret < 0) {
528 lderr(cct) << "error registering admin socket command: "
529 << cpp_strerror(-ret) << dendl;
530 }
531
532 client_lock.Lock();
533 initialized = true;
534 client_lock.Unlock();
535}
536
// Orderly teardown, mirroring init()/_finish_init(): close MDS
// sessions, drop the config observer and admin-socket commands, drain
// and stop every finisher whose callback was ever installed, then stop
// the objectcacher and timer, and finally remove the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // keep this list in sync with the registrations in _finish_init()
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // each finisher only runs if its callback was installed; drain it so
  // queued invalidations complete before the thread stops
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
596
597
598// ===================
599// metadata cache stuff
600
// Evict dentries from the LRU until it fits under its configured max.
// Each trim_dentry() may unpin further entries, so we loop until a
// pass makes no progress.  Optionally ask the kernel to shrink its
// dcache too, and drop the root entirely once the cache is empty.
void Client::trim_cache(bool trim_kernel_dcache)
{
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
  unsigned last = 0;
  // fixed-point loop: stop when a pass fails to shrink the LRU
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (lru.lru_get_size() <= lru.lru_get_max())  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: pinned dentries block us, so ask the kernel to
  // release its references
  if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
    _invalidate_kernel_dcache();

  // hose root?  (same teardown as tear_down_cache's final step)
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
633
634void Client::trim_cache_for_reconnect(MetaSession *s)
635{
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662}
663
// Unlink a dentry being evicted from the cache.  If it pointed at an
// inode, the parent directory's contents are effectively changing, so
// its cached completeness/ordering must be invalidated first.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir " << hex << dn->dir->parent_inode->ino
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
676
677
// Apply size/truncation/time metadata from the MDS to a cached inode.
// 'issued' is the set of caps we hold (issued | dirty | implemented):
// when we hold exclusive/writable caps our local values may be newer
// than the MDS's, so each field is only taken when the MDS is
// authoritative, arbitrated by truncate_seq and time_warp_seq.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // take the MDS's size if its truncation epoch is newer, or if it
  // grew the file within the same epoch
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we may have newer local times; only take the MDS's if its
    // time_warp_seq is ahead of ours
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    // without writable caps the MDS is authoritative for all times
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // local seq ran ahead of the MDS without the caps to justify it
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
783
784void Client::_fragmap_remove_non_leaves(Inode *in)
785{
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791}
792
793void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794{
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800}
801
// Insert or refresh one inode in the cache from an InodeStat carried in
// an MDS reply.  Creates the Inode (and possibly installs it as root)
// on first sight, then selectively merges the MDS's metadata: fields
// covered by exclusive caps we hold are left alone, everything else is
// taken when the MDS's version is newer.  Finally records the cap grant
// from the reply.  Returns the cached inode (never null).
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // very first inode we learn about becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path walking can discover ancestors above the
      // current root; track them so teardown can account for them
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // caps we hold in any form (issued, dirty, or implemented) shield
    // the corresponding fields from being overwritten below
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    // size/truncation/time fields have their own cap-aware merge logic
    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    // max_size only makes sense coming from the auth MDS
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    // an empty dir with FILE_SHARED is trivially complete and ordered
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
937/*
938 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
939 */
// Link 'in' under 'dir' at name 'dname'.  An existing dentry with the
// right target is just touched; a stale one is unlinked first.  When
// 'old_dentry' is given (a rename source), it is unlinked and its
// directory's ordering invalidated.  Returns the (possibly new) dentry
// with its lease refreshed.
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
	       << " with correct vino " << dn->inode->vino()
	       << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
	       << " with WRONG vino " << dn->inode->vino()
	       << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link shuffle so 'in' can't be freed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories changes the source dir's ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
// Record (or refresh) the dentry lease granted in an MDS reply.
// 'from' is when the request was sent, so the computed ttl is relative
// to the moment the lease actually started being valid.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    // only extend; never shorten an existing lease
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
	       << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  // tie the dentry to the parent dir's current shared generation so a
  // later cap revocation invalidates it
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1005
1006
1007/*
1008 * update MDS location cache for a single inode
1009 */
/*
 * Update the cached MDS location info for one directory fragment from
 * the DirStat carried in an MDS reply.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: a negative rank means "unknown", so drop the cached mapping
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    // the MDS replied about a frag our tree doesn't consider a leaf;
    // force our tree to agree and drop now-stale non-leaf entries
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043{
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057}
1058
1059/*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?  lssnap results are inserted under the .snap pseudo-directory
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    // END: this fragment is exhausted; HASH_ORDER: entries are ordered by
    // name hash rather than by fragment+name
    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ('.' and '..' occupy 0 and 1)
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have split/merged fragments since we sent the request
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // starting a fresh listing from the beginning: snapshot the directory's
    // generation counters so we can tell later whether the shared readdir
    // cache stayed valid while we filled it
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    // decode each (name, lease, inode) triple and link it into the cache
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// entries with a new hash value restart the per-hash offset counter
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache -- only while the generation counters still
      // match what we snapshotted at the start of the listing
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  assert(0 == "unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;  // next fragment starts at the first real entry
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1214
1215/** insert_trace
1216 *
1217 * insert a trace from a MDS reply into the cache.
1218 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already populated the cache; the safe reply
    // carries no trace to process
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we must invalidate cached state ourselves since the
    // MDS gave us no updated metadata
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // a dentry trace carries: parent dir inode stat, dir stat, name, lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug mode: verify the MDS included xattrs when we asked for them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry trace with no target inode: the name is a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];
    
    string dname = request->path.last_dentry();
    
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1384
1385// -------
1386
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // a forward or retry pins the target explicitly; honor it once and reset
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // derive an inode (and possibly a dentry-name hash) from the request
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to a live ancestor
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dn_set.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }
  
    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;
  
    // prefer the MDS that authoritatively owns the fragment the name hashes to
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
	goto out;
      }
    }
  
    // otherwise route via a cap: the auth cap if the op needs the auth MDS,
    // else any cap we hold
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
  
    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1486
1487
1488void Client::connect_mds_targets(mds_rank_t mds)
1489{
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503}
1504
1505void Client::dump_mds_sessions(Formatter *f)
1506{
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516}
1517void Client::dump_mds_requests(Formatter *f)
1518{
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526}
1527
/*
 * Resolve the target inode of a (possibly traceless) MDS reply.
 *
 * Fills *ptarget with the inode the request operated on and *pcreated with
 * whether this request actually created the inode (won the create race).
 * If the reply carried no trace, falls back to a lookup/getattr to find the
 * target.  Returns the (possibly updated) result code r.
 */
int Client::verify_reply_trace(int r,
			       MetaRequest *request, MClientReply *reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the trace already resolved the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  assert(0 == "how did this happen? i want logs!");
	}
      } else {
	// no dentry either; refresh the inode the request named directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	// (a mismatch means another client raced us; surface as EINTR)
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1600
1601
1602/**
1603 * make a request
1604 *
1605 * Blocking helper to make an MDS request.
1606 *
1607 * If the ptarget flag is set, behavior changes slightly: the caller
1608 * expects to get a pointer to the inode we are creating or operating
1609 * on. As a result, we will follow up any traceless mutation reply
1610 * with a getattr or lookup to transparently handle a traceless reply
1611 * from the MDS (as when the MDS restarts and the client has to replay
1612 * a request).
1613 *
1614 * @param request the MetaRequest to execute
1615 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1616 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1617 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1618 * @param use_mds [optional] prefer a specific mds (-1 for default)
1619 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1620 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // filelock requests may block indefinitely on the MDS, so they are never
  // allowed to hold back the oldest_client_tid we advertise
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // send loop: pick an mds, ensure a session, send, and wait; loop again on
  // forward/kick until we get a reply or the request is aborted
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists (cluster shrank)
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply arrived; clean up our registration and ref
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // resolve the target inode, transparently handling traceless replies
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1768
1769void Client::unregister_request(MetaRequest *req)
1770{
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787}
1788
1789void Client::put_request(MetaRequest *request)
1790{
1791 if (request->_put()) {
1792 int op = -1;
1793 if (request->success)
1794 op = request->get_op();
1795 InodeRef other_in;
1796 request->take_other_inode(&other_in);
1797 delete request;
1798
1799 if (other_in &&
1800 (op == CEPH_MDS_OP_RMDIR ||
1801 op == CEPH_MDS_OP_RENAME ||
1802 op == CEPH_MDS_OP_RMSNAP)) {
1803 _try_to_trim_inode(other_in.get(), false);
1804 }
1805 }
1806}
1807
1808int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 mds_rank_t mds, int drop,
1810 int unless, int force)
1811{
1812 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 << ", have:" << ", force:" << force << ")" << dendl;
1815 int released = 0;
1816 if (in->caps.count(mds)) {
1817 Cap *caps = in->caps[mds];
1818 drop &= ~(in->dirty_caps | get_caps_used(in));
1819 if ((drop & caps->issued) &&
1820 !(unless & caps->issued)) {
1821 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822 caps->issued &= ~drop;
1823 caps->implemented &= ~drop;
1824 released = 1;
1825 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826 } else {
1827 released = force;
1828 }
1829 if (released) {
1830 ceph_mds_request_release rel;
1831 rel.ino = in->ino;
1832 rel.cap_id = caps->cap_id;
1833 rel.seq = caps->seq;
1834 rel.issue_seq = caps->issue_seq;
1835 rel.mseq = caps->mseq;
1836 rel.caps = caps->implemented;
1837 rel.wanted = caps->wanted;
1838 rel.dname_len = 0;
1839 rel.dname_seq = 0;
1840 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841 }
1842 }
1843 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 << released << dendl;
1845 return released;
1846}
1847
1848void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1849 mds_rank_t mds, int drop, int unless)
1850{
1851 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1852 << dn << ")" << dendl;
1853 int released = 0;
1854 if (dn->dir)
1855 released = encode_inode_release(dn->dir->parent_inode, req,
1856 mds, drop, unless, 1);
1857 if (released && dn->lease_mds == mds) {
1858 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1859 MClientRequest::Release& rel = req->cap_releases.back();
1860 rel.item.dname_len = dn->name.length();
1861 rel.item.dname_seq = dn->lease_seq;
1862 rel.dname = dn->name;
1863 }
1864 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1865 << dn << ")" << dendl;
1866}
1867
1868
1869/*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << "encode_cap_releases enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  // primary inode of the operation
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);
  
  // secondary inodes (e.g. rename source, link target)
  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);
  
  // dentry leases (each also forces a release on its parent dir inode)
  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);
  
  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << "encode_cap_releases exit (req: "
		 << req << ", mds " << mds <<dendl;
}
1905
1906bool Client::have_open_session(mds_rank_t mds)
1907{
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912}
1913
1914MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915{
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922}
1923
1924MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925{
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929}
1930
1931/**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935void Client::populate_metadata(const std::string &mount_root)
1936{
1937 // Hostname
1938 struct utsname u;
1939 int r = uname(&u);
1940 if (r >= 0) {
1941 metadata["hostname"] = u.nodename;
1942 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 } else {
1944 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945 }
1946
1947 metadata["pid"] = stringify(getpid());
1948
1949 // Ceph entity id (the '0' in "client.0")
1950 metadata["entity_id"] = cct->_conf->name.get_id();
1951
1952 // Our mount position
1953 if (!mount_root.empty()) {
1954 metadata["root"] = mount_root;
1955 }
1956
1957 // Ceph version
1958 metadata["ceph_version"] = pretty_version_to_str();
1959 metadata["ceph_sha1"] = git_version_to_str();
1960
1961 // Apply any metadata from the user's configured overrides
1962 std::vector<std::string> tokens;
1963 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964 for (const auto &i : tokens) {
1965 auto eqpos = i.find("=");
1966 // Throw out anything that isn't of the form "<str>=<str>"
1967 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969 continue;
1970 }
1971 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972 }
1973}
1974
1975/**
1976 * Optionally add or override client metadata fields.
1977 */
1978void Client::update_metadata(std::string const &k, std::string const &v)
1979{
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989}
1990
/*
 * Create a session object for the given mds rank and (unless that daemon
 * previously rejected us) send a REQUEST_OPEN to it.  The session is
 * registered in mds_sessions in STATE_OPENING and is returned in that state
 * either way -- callers must check for rejection separately.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      // same daemon instance: don't even try; return the unopened session
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // the rank was taken over by a new instance; forget the old rejection
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  // advertise our client metadata (hostname, pid, version, ...)
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
2023
2024void Client::_close_mds_session(MetaSession *s)
2025{
2026 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2027 s->state = MetaSession::STATE_CLOSING;
2028 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2029}
2030
/*
 * Tear down a session that is now closed: drop the connection, wake all
 * waiters, release the session's caps, kick its in-flight requests so they
 * get resent elsewhere, and finally unregister and free the session.
 */
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anyone blocked waiting for this session to open; they will
  // re-evaluate now that it is CLOSED
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  // erase before delete so no one can find the dangling pointer
  mds_sessions.erase(s->mds_num);
  delete s;
}
2042
/*
 * Dispatch an MClientSession message from an MDS.  Messages from
 * connections without a matching registered session are dropped.
 * Consumes (puts) the message in all paths.
 */
void Client::handle_client_session(MClientSession *m) 
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: start the cap renew cycle and (unless we are
    // unmounting) pre-open sessions to this rank's export targets
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend cap_ttl if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // the MDS thinks we went quiet; renew to revalidate our caps
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS is under cache pressure; trim down to the requested cap count
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember which daemon instance rejected us so we don't keep retrying it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2106
2107bool Client::_any_stale_sessions() const
2108{
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118}
2119
2120void Client::_kick_stale_sessions()
2121{
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131}
2132
/**
 * Build and transmit @request to the MDS behind @session.
 *
 * A request that already received an unsafe reply is resent as a
 * replayed op (so the MDS does not apply it twice); otherwise cap
 * releases are piggybacked unless @drop_cap_releases is set (used
 * before the cap reconnect has been sent).
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // Already applied on the MDS; mark as replay.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  // Stamp with our current map epochs so the MDS can detect staleness.
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // First transmission of this request.
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap migration seq at send time; the ESTALE path in
  // handle_client_reply() compares against it to decide whether a
  // retry can help.
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2177
/**
 * Translate an internal MetaRequest into a wire MClientRequest.
 *
 * If no filepath was set on the request, one is derived from the
 * request's inode or dentry.  Note the side effect: retry_attempt is
 * incremented on every (re)build.
 */
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// Dentry without an inode: path of the parent dir plus the name.
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
			 << " No path, inode, or appropriately-endowed dentry given!"
			 << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		       << " No path, inode, or dentry given!"
		       << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2215
2216
2217
2218void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219{
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255}
2256
2257bool Client::is_dir_operation(MetaRequest *req)
2258{
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266}
2267
/**
 * Handle an MClientReply for a pending request.
 *
 * Replies come in two flavours: "unsafe" (applied by the MDS but not
 * yet durable) and "safe".  The caller thread is signalled only on the
 * first reply it sees; a safe reply that follows an unsafe one just
 * finishes the bookkeeping and unregisters the request.  An ESTALE
 * result triggers re-targeting to the auth MDS when that can help.
 */
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only surface ESTALE if a resend would hit the same MDS with the
    // same cap migration seq we already sent on (sent_on_mseq is
    // recorded in send_request()); otherwise wake the caller to retry.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }

  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    // Block until the caller has consumed the reply and cleared
    // dispatch_cond; this hand-off keeps the request state stable
    // while the caller reads it.
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2370
2371void Client::_handle_full_flag(int64_t pool)
2372{
2373 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2374 << "on " << pool << dendl;
2375 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2376 // to do this rather than blocking, because otherwise when we fill up we
2377 // potentially lock caps forever on files with dirty pages, and we need
2378 // to be able to release those caps to the MDS so that it can delete files
2379 // and free up space.
2380 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2381
2382 // For all inodes with layouts in this pool and a pending flush write op
2383 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2384 // from ObjectCacher so that it doesn't re-issue the write in response to
2385 // the ENOSPC error.
2386 // Fortunately since we're cancelling everything in a given pool, we don't
2387 // need to know which ops belong to which ObjectSet, we can just blow all
2388 // the un-flushed cached data away and mark any dirty inodes' async_err
2389 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2390 // affecting this pool, and all the objectsets we're purging were also
2391 // in this pool.
2392 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2393 i != inode_map.end(); ++i)
2394 {
2395 Inode *inode = i->second;
2396 if (inode->oset.dirty_or_tx
2397 && (pool == -1 || inode->layout.pool_id == pool)) {
2398 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2399 << " has dirty objects, purging and setting ENOSPC" << dendl;
2400 objectcacher->purge_set(&inode->oset);
2401 inode->set_async_err(-ENOSPC);
2402 }
2403 }
2404
2405 if (cancelled_epoch != (epoch_t)-1) {
2406 set_cap_epoch_barrier(cancelled_epoch);
2407 }
2408}
2409
2410void Client::handle_osd_map(MOSDMap *m)
2411{
31f18b77
FG
2412 std::set<entity_addr_t> new_blacklists;
2413 objecter->consume_blacklist_events(&new_blacklists);
2414
2415 const auto myaddr = messenger->get_myaddr();
2416 if (!blacklisted && new_blacklists.count(myaddr)) {
2417 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2418 return o.get_epoch();
2419 });
2420 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2421 blacklisted = true;
2422 for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2423 p != mds_requests.end(); ) {
2424 auto req = p->second;
2425 ++p;
2426 req->abort(-EBLACKLISTED);
2427 if (req->caller_cond) {
2428 req->kick = true;
2429 req->caller_cond->Signal();
2430 }
2431 }
2432
2433 // Progress aborts on any requests that were on this waitlist. Any
2434 // requests that were on a waiting_for_open session waitlist
2435 // will get kicked during close session below.
2436 signal_cond_list(waiting_for_mdsmap);
2437
2438 // Force-close all sessions: assume this is not abandoning any state
2439 // on the MDS side because the MDS will have seen the blacklist too.
2440 while(!mds_sessions.empty()) {
2441 auto i = mds_sessions.begin();
2442 auto session = i->second;
2443 _closed_mds_session(session);
2444 }
2445
2446 // Since we know all our OSD ops will fail, cancel them all preemtively,
2447 // so that on an unhealthy cluster we can umount promptly even if e.g.
2448 // some PGs were inaccessible.
2449 objecter->op_cancel_writes(-EBLACKLISTED);
2450
2451 } else if (blacklisted) {
2452 // Handle case where we were blacklisted but no longer are
2453 blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
2454 return o.is_blacklisted(myaddr);});
2455 }
2456
7c673cae
FG
2457 if (objecter->osdmap_full_flag()) {
2458 _handle_full_flag(-1);
2459 } else {
2460 // Accumulate local list of full pools so that I can drop
2461 // the objecter lock before re-entering objecter in
2462 // cancel_writes
2463 std::vector<int64_t> full_pools;
2464
2465 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2466 for (const auto& kv : o.get_pools()) {
2467 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2468 full_pools.push_back(kv.first);
2469 }
2470 }
2471 });
2472
2473 for (auto p : full_pools)
2474 _handle_full_flag(p);
2475
2476 // Subscribe to subsequent maps to watch for the full flag going
2477 // away. For the global full flag objecter does this for us, but
2478 // it pays no attention to the per-pool full flag so in this branch
2479 // we do it ourselves.
2480 if (!full_pools.empty()) {
2481 objecter->maybe_request_map();
2482 }
2483 }
2484
2485 m->put();
2486}
2487
2488
2489// ------------------------
2490// incoming messages
2491
2492
/**
 * Top-level dispatcher for messages arriving from the cluster.
 * Takes client_lock for the duration; returns false for message types
 * we do not handle (so another dispatcher may claim them).
 */
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Only MDS command replies are ours; let others fall through.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, every dispatched message is an opportunity to
  // shrink the cache; poke unmount() whenever the trim made progress.
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2570
2571void Client::handle_fs_map(MFSMap *m)
2572{
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579}
2580
2581void Client::handle_fs_map_user(MFSMapUser *m)
2582{
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589}
2590
/**
 * Handle a new MDSMap epoch.
 *
 * Old/duplicate epochs are ignored.  Command ops targeting GIDs that
 * vanished or went laggy are cancelled, then each tracked session is
 * reconciled with its rank's new state: mark the connection down,
 * start reconnect, kick requests on newly-active ranks, or close the
 * session for ranks that no longer exist.  Consumes (puts) the message.
 */
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // Keep the old map so per-rank state transitions can be compared.
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // Erase after the scan so the loop above never invalidates its own
  // iterator over command_table.
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;  // advance first: _closed_mds_session() below erases from the map

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // Rank moved to a different daemon instance.
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
        // kick new requests
        kick_requests(session);
        kick_flushing_caps(session);
        signal_context_list(session->waiting_for_open);
        kick_maxsize_requests(session);
        wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists (cluster shrank); drop the session.
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2682
/**
 * Send an MClientReconnect to an MDS entering reconnect state.
 *
 * Trims the cache, resets per-cap sequence numbers, resends unsafe
 * requests, and reports every cap (plus snaprealm and file lock state)
 * this client holds from that MDS so it can rebuild the session.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  // Walk every cached inode and describe each cap we hold from this MDS.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(in->caps[mds]->issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      // Claim everything we had implemented as issued for the replay.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // Describe each snaprealm only once per reconnect message.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2758
2759
2760void Client::kick_requests(MetaSession *session)
2761{
2762 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764 p != mds_requests.end();
2765 ++p) {
31f18b77
FG
2766 MetaRequest *req = p->second;
2767 if (req->got_unsafe)
2768 continue;
2769 if (req->aborted()) {
2770 if (req->caller_cond) {
2771 req->kick = true;
2772 req->caller_cond->Signal();
2773 }
7c673cae 2774 continue;
31f18b77
FG
2775 }
2776 if (req->retry_attempt > 0)
7c673cae 2777 continue; // new requests only
31f18b77 2778 if (req->mds == session->mds_num) {
7c673cae
FG
2779 send_request(p->second, session);
2780 }
2781 }
2782}
2783
2784void Client::resend_unsafe_requests(MetaSession *session)
2785{
2786 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2787 !iter.end();
2788 ++iter)
2789 send_request(*iter, session);
2790
2791 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2792 // process completed requests in clientreplay stage.
2793 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2794 p != mds_requests.end();
2795 ++p) {
2796 MetaRequest *req = p->second;
2797 if (req->got_unsafe)
2798 continue;
31f18b77
FG
2799 if (req->aborted())
2800 continue;
7c673cae
FG
2801 if (req->retry_attempt == 0)
2802 continue; // old requests only
2803 if (req->mds == session->mds_num)
2804 send_request(req, session, true);
2805 }
2806}
2807
2808void Client::wait_unsafe_requests()
2809{
2810 list<MetaRequest*> last_unsafe_reqs;
2811 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812 p != mds_sessions.end();
2813 ++p) {
2814 MetaSession *s = p->second;
2815 if (!s->unsafe_requests.empty()) {
2816 MetaRequest *req = s->unsafe_requests.back();
2817 req->get();
2818 last_unsafe_reqs.push_back(req);
2819 }
2820 }
2821
2822 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823 p != last_unsafe_reqs.end();
2824 ++p) {
2825 MetaRequest *req = *p;
2826 if (req->unsafe_item.is_on_list())
2827 wait_on_list(req->waitfor_safe);
2828 put_request(req);
2829 }
2830}
2831
2832void Client::kick_requests_closed(MetaSession *session)
2833{
2834 ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
2835 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2836 p != mds_requests.end(); ) {
2837 MetaRequest *req = p->second;
2838 ++p;
2839 if (req->mds == session->mds_num) {
2840 if (req->caller_cond) {
2841 req->kick = true;
2842 req->caller_cond->Signal();
2843 }
2844 req->item.remove_myself();
2845 if (req->got_unsafe) {
2846 lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
2847 req->unsafe_item.remove_myself();
2848 req->unsafe_dir_item.remove_myself();
2849 req->unsafe_target_item.remove_myself();
2850 signal_cond_list(req->waitfor_safe);
2851 unregister_request(req);
2852 }
2853 }
2854 }
2855 assert(session->requests.empty());
2856 assert(session->unsafe_requests.empty());
2857}
2858
2859
2860
2861
2862/************
2863 * leases
2864 */
2865
2866void Client::got_mds_push(MetaSession *s)
2867{
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873}
2874
/**
 * Handle a dentry lease revocation from the MDS (only
 * CEPH_MDS_LEASE_REVOKE is expected).  Invalidates the local lease if
 * we hold it, then always acks with CEPH_MDS_LEASE_RELEASE carrying
 * the same seq.  Consumes (puts) the message.
 */
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // we no longer hold a lease from any mds
  }

 revoke:
  // Always acknowledge the revoke, even if we had nothing to drop.
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2917
2918void Client::put_inode(Inode *in, int n)
2919{
2920 ldout(cct, 10) << "put_inode on " << *in << dendl;
2921 int left = in->_put(n);
2922 if (left == 0) {
2923 // release any caps
2924 remove_all_caps(in);
2925
2926 ldout(cct, 10) << "put_inode deleting " << *in << dendl;
2927 bool unclean = objectcacher->release_set(&in->oset);
2928 assert(!unclean);
2929 inode_map.erase(in->vino());
2930 if (use_faked_inos())
2931 _release_faked_ino(in);
2932
2933 if (in == root) {
2934 root = 0;
2935 root_ancestor = 0;
2936 while (!root_parents.empty())
2937 root_parents.erase(root_parents.begin());
2938 }
2939
2940 delete in;
2941 }
2942}
2943
2944void Client::close_dir(Dir *dir)
2945{
2946 Inode *in = dir->parent_inode;
2947 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2948 assert(dir->is_empty());
2949 assert(in->dir == dir);
2950 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2951 if (!in->dn_set.empty())
2952 in->get_first_parent()->put(); // unpin dentry
2953
2954 delete in->dir;
2955 in->dir = 0;
2956 put_inode(in); // unpin inode
2957}
2958
 /**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Creates (or reuses) a Dentry in @dir named @name and links @in to
 * it.  Maintains the pin accounting: a directory inode's open Dir and
 * its ll_ref each hold a pin on the dentry.  A directory may have only
 * one parent dentry, so a pre-existing parent link is severed first.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;

    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // Mirror the pins that unlink()/close_dir() will drop later.
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
  }

  return dn;
}
3011
3012void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013{
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048}
3049
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // reference keeps the inode alive until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Called with the flush result; must run under client_lock.
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // Record the error on the inode for later reporting.
      inode->set_async_err(r);
    }
  }
};
3071
3072
3073/****
3074 * caps
3075 */
3076
3077void Client::get_cap_ref(Inode *in, int cap)
3078{
3079 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3080 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3081 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3082 in->get();
3083 }
3084 if ((cap & CEPH_CAP_FILE_CACHE) &&
3085 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3086 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3087 in->get();
3088 }
3089 in->get_cap_ref(cap);
3090}
3091
/**
 * Drop cap reference(s) on @in.  When the last reference on some cap
 * bits is released this may finish a pending cap_snap, wake blocked
 * writers/committers, re-run check_caps for bits no longer issued, and
 * drop the inode pins taken in get_cap_ref().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // Bits we held a ref on that are no longer issued to us.
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // pairs with the in->get() taken on the first BUFFER ref
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // pairs with the in->get() taken on the first CACHE ref
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3125
/*
 * Block until this inode holds the caps in 'need' (plus any of 'want'
 * that are not mid-revocation).  On success, *phave is set to
 * need | (have & want) and a cap reference is taken on 'need'.
 *
 * For writes, endoff is the end offset of the intended write: we may
 * need to request a larger max_size from the MDS and wait for it, and
 * we must let any pending cap_snap capture state first.
 *
 * Returns 0 on success, -EBADF if the open-file modes no longer cover
 * 'need', -EROFS for a write on a readonly session, or an error from
 * pool-permission checking / cap renewal.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // the caller's open file handles must still imply what it needs
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a bigger max_size if the write would exceed the
      // current one (or more than double the file size)
      if (endoff > 0 &&
	  (endoff >= (loff_t)in->max_size ||
	   endoff > (loff_t)(in->size << 1)) &&
	  endoff > (loff_t)in->wanted_max_size) {
	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	in->wanted_max_size = endoff;
	check_caps(in, 0);
      }

      // cannot write past max_size until the MDS extends it
      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      // pending cap_snaps must capture pre-snap state before we dirty
      // the inode any further
      if (!in->cap_snaps.empty()) {
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  // kick writeback so the dirty snap data gets committed
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	// don't hand out 'want' bits that the MDS is currently revoking
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    // writes can never succeed on a readonly session; fail now rather
    // than blocking forever
    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    // our caps were dropped (e.g. session reset): re-request them from
    // the MDS before sleeping, or we could wait indefinitely
    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if ((mds_wanted & file_wanted) ==
	  (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
	in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3220
3221int Client::get_caps_used(Inode *in)
3222{
3223 unsigned used = in->caps_used();
3224 if (!(used & CEPH_CAP_FILE_CACHE) &&
3225 !objectcacher->set_is_empty(&in->oset))
3226 used |= CEPH_CAP_FILE_CACHE;
3227 return used;
3228}
3229
3230void Client::cap_delay_requeue(Inode *in)
3231{
3232 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3233 in->hold_caps_until = ceph_clock_now();
3234 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3235 delayed_caps.push_back(&in->cap_item);
3236}
3237
/*
 * Compose and send a CEPH_CAP_OP_UPDATE message for one cap: report what
 * we are using/wanting, locally drop any issued bits not in 'retain',
 * and carry any dirty metadata being flushed (flush/flush_tid).  The
 * message also piggybacks current inode metadata (size, times, xattrs,
 * max_size request) so the MDS stays up to date.
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      bool sync, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never try to retain bits the MDS is revoking
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << (sync ? " sync " : " async ")
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  // test hook: deliberately misreport our cap state to simulate a
  // client that fails to release revoked caps
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: locally drop everything we are not retaining
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // a metadata flush is tagged with the snap context it follows
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
				   in->ino,
				   0,
				   cap->cap_id, cap->seq,
				   cap->implemented,
				   want,
				   flush,
				   cap->mseq,
				   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // flushing Xx carries the xattr payload itself
  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;   // the MDS now knows this size
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth cap negotiates max_size with the MDS
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3347
31f18b77
FG
3348static bool is_max_size_approaching(Inode *in)
3349{
3350 /* mds will adjust max size according to the reported size */
3351 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 return false;
3353 if (in->size >= in->max_size)
3354 return true;
3355 /* half of previous max_size increment has been used */
3356 if (in->max_size > in->reported_size &&
3357 (in->size << 1) >= in->max_size + in->reported_size)
3358 return true;
3359 return false;
3360}
7c673cae
FG
3361
3362/**
3363 * check_caps
3364 *
3365 * Examine currently used and wanted versus held caps. Release, flush or ack
3366 * revoked caps to the MDS as appropriate.
3367 *
3368 * @param in the inode to check
3369 * @param flags flags to apply to cap check
3370 */
3371void Client::check_caps(Inode *in, unsigned flags)
3372{
3373 unsigned wanted = in->caps_wanted();
3374 unsigned used = get_caps_used(in);
3375 unsigned cap_used;
3376
3377 if (in->is_dir() && (in->flags & I_COMPLETE)) {
3378 // we do this here because we don't want to drop to Fs (and then
3379 // drop the Fs if we do a create!) if that alone makes us send lookups
3380 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3381 wanted |= CEPH_CAP_FILE_EXCL;
3382 }
3383
3384 int implemented;
3385 int issued = in->caps_issued(&implemented);
3386 int revoking = implemented & ~issued;
3387
3388 int retain = wanted | used | CEPH_CAP_PIN;
3389 if (!unmounting) {
3390 if (wanted)
3391 retain |= CEPH_CAP_ANY;
3392 else
3393 retain |= CEPH_CAP_ANY_SHARED;
3394 }
3395
3396 ldout(cct, 10) << "check_caps on " << *in
3397 << " wanted " << ccap_string(wanted)
3398 << " used " << ccap_string(used)
3399 << " issued " << ccap_string(issued)
3400 << " revoking " << ccap_string(revoking)
3401 << " flags=" << flags
3402 << dendl;
3403
3404 if (in->snapid != CEPH_NOSNAP)
3405 return; //snap caps last forever, can't write
3406
3407 if (in->caps.empty())
3408 return; // guard if at end of func
3409
3410 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
3411 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
3412 _release(in);
3413
3414 if (!in->cap_snaps.empty())
3415 flush_snaps(in);
3416
3417 if (flags & CHECK_CAPS_NODELAY)
3418 in->hold_caps_until = utime_t();
3419 else
3420 cap_delay_requeue(in);
3421
3422 utime_t now = ceph_clock_now();
3423
3424 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3425 while (it != in->caps.end()) {
3426 mds_rank_t mds = it->first;
3427 Cap *cap = it->second;
3428 ++it;
3429
3430 MetaSession *session = mds_sessions[mds];
3431 assert(session);
3432
3433 cap_used = used;
3434 if (in->auth_cap && cap != in->auth_cap)
3435 cap_used &= ~in->auth_cap->issued;
3436
3437 revoking = cap->implemented & ~cap->issued;
3438
3439 ldout(cct, 10) << " cap mds." << mds
3440 << " issued " << ccap_string(cap->issued)
3441 << " implemented " << ccap_string(cap->implemented)
3442 << " revoking " << ccap_string(revoking) << dendl;
3443
3444 if (in->wanted_max_size > in->max_size &&
3445 in->wanted_max_size > in->requested_max_size &&
3446 cap == in->auth_cap)
3447 goto ack;
3448
3449 /* approaching file_max? */
3450 if ((cap->issued & CEPH_CAP_FILE_WR) &&
31f18b77
FG
3451 cap == in->auth_cap &&
3452 is_max_size_approaching(in)) {
7c673cae 3453 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3454 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3455 goto ack;
3456 }
3457
3458 /* completed revocation? */
3459 if (revoking && (revoking & cap_used) == 0) {
3460 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3461 goto ack;
3462 }
3463
3464 /* want more caps from mds? */
3465 if (wanted & ~(cap->wanted | cap->issued))
3466 goto ack;
3467
3468 if (!revoking && unmounting && (cap_used == 0))
3469 goto ack;
3470
3471 if (wanted == cap->wanted && // mds knows what we want.
3472 ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
3473 !in->dirty_caps) // and we have no dirty caps
3474 continue;
3475
3476 if (now < in->hold_caps_until) {
3477 ldout(cct, 10) << "delaying cap release" << dendl;
3478 continue;
3479 }
3480
3481 ack:
3482 // re-send old cap/snapcap flushes first.
3483 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3484 session->mds_state < MDSMap::STATE_ACTIVE &&
3485 session->early_flushing_caps.count(in) == 0) {
3486 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3487 << " to mds." << session->mds_num << dendl;
3488 session->early_flushing_caps.insert(in);
3489 if (in->cap_snaps.size())
3490 flush_snaps(in, true);
3491 if (in->flushing_caps)
3492 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3493 }
3494
3495 int flushing;
3496 ceph_tid_t flush_tid;
3497 if (in->auth_cap == cap && in->dirty_caps) {
3498 flushing = mark_caps_flushing(in, &flush_tid);
3499 } else {
3500 flushing = 0;
3501 flush_tid = 0;
3502 }
3503
3504 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3505 retain, flushing, flush_tid);
3506 }
3507}
3508
3509
/*
 * On snap creation, capture the inode's current (pre-snap) state in a
 * CapSnap so it can later be flushed to the MDS under the old snap
 * context.  If nothing is dirty or being written, there is nothing to
 * preserve.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the previous cap_snap is still being written; don't stack another
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
	     (used & CEPH_CAP_FILE_WR) ||
	     (dirty & CEPH_CAP_ANY_WR)) {
    // NOTE(review): 'dirty' is in->caps_dirty(), so the third condition
    // is subsumed by the first; kept as-is to preserve behavior.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered data needs to be written back for this snap
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata we are responsible for reporting
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // writes still in progress: size/times are captured later, when the
      // last writer drops its Fw ref (see put_cap_ref -> finish_cap_snap)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3549
/*
 * Record the final size/time/dirty state in a queued CapSnap.  If dirty
 * buffered data is still outstanding, the FLUSHSNAP is deferred until
 * writeback completes (see _flushed_cap_snap); otherwise flush now.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // pick up anything dirtied since the snap was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data for this snap is still in the object cacher; wait
    // for writeback before flushing the snap
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3575
3576void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3577{
3578 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3579 in->cap_snaps.at(seq).dirty_data = 0;
3580 flush_snaps(in);
3581}
3582
/*
 * Send CEPH_CAP_OP_FLUSHSNAP messages to the auth MDS for any cap_snaps
 * that are ready (no pending writes or dirty buffered data).  If
 * all_again is set, re-send snaps that already have a flush tid — used
 * by check_caps when reflushing to a reconnecting MDS.
 */
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
	continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // this snap hasn't finished capturing/writing back its data yet
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // first flush attempt: allocate a tid and track it on the session
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
	session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
				     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    // attach the metadata captured at snap time
    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3659
3660
3661
3662void Client::wait_on_list(list<Cond*>& ls)
3663{
3664 Cond cond;
3665 ls.push_back(&cond);
3666 cond.Wait(client_lock);
3667 ls.remove(&cond);
3668}
3669
3670void Client::signal_cond_list(list<Cond*>& ls)
3671{
3672 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3673 (*it)->Signal();
3674}
3675
3676void Client::wait_on_context_list(list<Context*>& ls)
3677{
3678 Cond cond;
3679 bool done = false;
3680 int r;
3681 ls.push_back(new C_Cond(&cond, &done, &r));
3682 while (!done)
3683 cond.Wait(client_lock);
3684}
3685
3686void Client::signal_context_list(list<Context*>& ls)
3687{
3688 while (!ls.empty()) {
3689 ls.front()->complete(0);
3690 ls.pop_front();
3691 }
3692}
3693
3694void Client::wake_inode_waiters(MetaSession *s)
3695{
3696 xlist<Cap*>::iterator iter = s->caps.begin();
3697 while (!iter.end()){
3698 signal_cond_list((*iter)->inode->waitfor_caps);
3699 ++iter;
3700 }
3701}
3702
3703
3704// flush dirty data (from objectcache)
3705
// Completion used to invoke the user's cache-invalidate callback from the
// async_ino_invalidator thread, outside client_lock.  The vinodeno is
// captured at construction time; the Inode pointer is not retained.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;           // (possibly faked) inode number to invalidate
  int64_t offset, length;   // byte range passed through to the callback
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3725
3726void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3727{
3728 if (unmounting)
3729 return;
3730 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3731 ino_invalidate_cb(callback_handle, ino, off, len);
3732}
3733
3734void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3735
3736 if (ino_invalidate_cb)
3737 // we queue the invalidate, which calls the callback and decrements the ref
3738 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3739}
3740
3741void Client::_invalidate_inode_cache(Inode *in)
3742{
3743 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3744
3745 // invalidate our userspace inode cache
3746 if (cct->_conf->client_oc)
3747 objectcacher->release_set(&in->oset);
3748
3749 _schedule_invalidate_callback(in, 0, 0);
3750}
3751
3752void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3753{
3754 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3755
3756 // invalidate our userspace inode cache
3757 if (cct->_conf->client_oc) {
3758 vector<ObjectExtent> ls;
3759 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3760 objectcacher->discard_set(&in->oset, ls);
3761 }
3762
3763 _schedule_invalidate_callback(in, off, len);
3764}
3765
3766bool Client::_release(Inode *in)
3767{
3768 ldout(cct, 20) << "_release " << *in << dendl;
3769 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3770 _invalidate_inode_cache(in);
3771 return true;
3772 }
3773 return false;
3774}
3775
3776bool Client::_flush(Inode *in, Context *onfinish)
3777{
3778 ldout(cct, 10) << "_flush " << *in << dendl;
3779
3780 if (!in->oset.dirty_or_tx) {
3781 ldout(cct, 10) << " nothing to flush" << dendl;
3782 onfinish->complete(0);
3783 return true;
3784 }
3785
3786 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3787 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3788 objectcacher->purge_set(&in->oset);
3789 if (onfinish) {
3790 onfinish->complete(-ENOSPC);
3791 }
3792 return true;
3793 }
3794
3795 return objectcacher->flush_set(&in->oset, onfinish);
3796}
3797
/*
 * Synchronously flush a byte range of this inode's buffered data.
 * client_lock must be held on entry; it is dropped while we block
 * waiting for the writeback to finish, then re-taken.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private mutex/cond pair for the completion, since we must release
  // client_lock while waiting
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3822
3823void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3824{
3825 // Mutex::Locker l(client_lock);
3826 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3827 Inode *in = static_cast<Inode *>(oset->parent);
3828 assert(in);
3829 _flushed(in);
3830}
3831
3832void Client::_flushed(Inode *in)
3833{
3834 ldout(cct, 10) << "_flushed " << *in << dendl;
3835
3836 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3837}
3838
3839
3840
3841// checks common to add_update_cap, handle_cap_grant
3842void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3843{
3844 unsigned had = in->caps_issued();
3845
3846 if ((issued & CEPH_CAP_FILE_CACHE) &&
3847 !(had & CEPH_CAP_FILE_CACHE))
3848 in->cache_gen++;
3849
3850 if ((issued & CEPH_CAP_FILE_SHARED) &&
3851 !(had & CEPH_CAP_FILE_SHARED)) {
3852 in->shared_gen++;
3853
3854 if (in->is_dir())
3855 clear_dir_complete_and_ordered(in, true);
3856 }
3857}
3858
/*
 * Record a cap granted by an MDS, or update an existing one.  The first
 * cap for an inode also opens its snap realm.  Handles auth-cap
 * migration (CEPH_CAP_FLAG_AUTH) and re-runs check_caps if a non-auth
 * MDS appears to be revoking bits that were just granted here.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
			    int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap if this one has a newer migration seq
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << "add_update_cap changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
	continue;
      if (it->second->implemented & ~it->second->issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // newly usable bits: wake anyone blocked in get_caps()
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3944
/*
 * Tear down a single cap: optionally queue a release message for the
 * MDS, detach the cap from its inode and session, and close the snap
 * realm if this was the inode's last cap.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batched into the session's next cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: drop our reference on the snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3983
3984void Client::remove_all_caps(Inode *in)
3985{
3986 while (!in->caps.empty())
3987 remove_cap(in->caps.begin()->second, true);
3988}
3989
/*
 * Drop every cap held through the given session (session closed or the
 * MDS dropped us).  Dirty/flushing state is discarded with an error,
 * since there is no longer any way to flush it through this session.
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;  // bitwise-or, collapsed to bool
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;  // get_caps() will re-request later
    }
    // no release queued; the session is going away anyway
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref so dropping the snaps can't free 'in' under us
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      put_inode(in);  // drop the ref taken when the caps were dirtied
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4025
4026class C_Client_Remount : public Context {
4027private:
4028 Client *client;
4029public:
4030 explicit C_Client_Remount(Client *c) : client(c) {}
4031 void finish(int r) override {
4032 assert (r == 0);
4033 r = client->remount_cb(client->callback_handle);
4034 if (r != 0) {
4035 client_t whoami = client->get_nodeid();
4036 lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error "
4037 << r << dendl;
4038 if (client->require_remount && !client->unmounting) {
4039 assert(0 == "failed to remount for kernel dentry trimming");
4040 }
4041 }
4042 }
4043};
4044
/*
 * Ask the kernel (FUSE) to drop cached dentries: either issue per-dentry
 * invalidations for root's children, or — when that is not possible but a
 * remount callback exists — trigger a remount, which trims all unused
 * dentries.
 */
void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
    for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
	 p != root->dir->dentries.end();
	 ++p) {
      if (p->second->inode)
	_schedule_invalidate_dentry_callback(p->second, false);
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4062
/*
 * Try to reduce the number of caps held via this session to 'max'
 * (as requested by the MDS).  Disposable non-auth caps are released
 * outright; for other caps we try to trim the inode's dentries so the
 * cap can be dropped once nothing references the inode.
 */
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    Inode *in = cap->inode;

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: nothing in use that only this cap covers
      if (!(get_caps_used(in) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	remove_cap(cap, true);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      InodeRef tmp_ref(in);   // keep 'in' alive while trimming its dentries
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  trim_dentry(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // every dentry was trimmed: count the cap as (soon to be) gone
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }

  // still over budget: fall back to kernel-wide dcache invalidation
  if (s->caps.size() > max)
    _invalidate_kernel_dcache();
}
4120
4121void Client::force_session_readonly(MetaSession *s)
4122{
4123 s->readonly = true;
4124 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4125 Inode *in = (*p)->inode;
4126 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4127 signal_cond_list(in->waitfor_caps);
4128 }
4129}
4130
4131void Client::mark_caps_dirty(Inode *in, int caps)
4132{
4133 ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
4134 << ccap_string(in->dirty_caps | caps) << dendl;
4135 if (caps && !in->caps_dirty())
4136 in->get();
4137 in->dirty_caps |= caps;
4138}
4139
/*
 * Move the inode's dirty caps into flushing state: allocate a flush tid,
 * record it on the inode and on the auth session, and return the cap
 * bits being flushed.  The tid is written to *ptid for use in the
 * outgoing cap message.
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // transfer the bits: dirty -> flushing
  in->flushing_caps |= flushing;
  in->dirty_caps = 0;

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4167
// When an inode's auth cap migrates between MDSs, move all of its
// in-flight flush bookkeeping (cap-snap flush tids, cap flush tids, and
// the flushing-inode list membership) from the old session to the new one.
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s,  MetaSession *new_s)
{
  // re-home pending snapshot-cap flush tids
  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }
  // re-home pending regular cap flush tids
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end();
       ++it) {
    old_s->flushing_caps_tids.erase(it->first);
    new_s->flushing_caps_tids.insert(it->first);
  }
  // push_back on an xlist item implicitly removes it from the old list
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
4185
4186/*
4187 * Flush all caps back to the MDS. Because the callers generally wait on the
4188 * result of this function (syncfs and umount cases), we set
4189 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4190 */
4191void Client::flush_caps_sync()
4192{
4193 ldout(cct, 10) << __func__ << dendl;
4194 xlist<Inode*>::iterator p = delayed_caps.begin();
4195 while (!p.end()) {
4196 unsigned flags = CHECK_CAPS_NODELAY;
4197 Inode *in = *p;
4198
4199 ++p;
4200 delayed_caps.pop_front();
4201 if (p.end() && cap_list.empty())
4202 flags |= CHECK_CAPS_SYNCHRONOUS;
4203 check_caps(in, flags);
4204 }
4205
4206 // other caps, too
4207 p = cap_list.begin();
4208 while (!p.end()) {
4209 unsigned flags = CHECK_CAPS_NODELAY;
4210 Inode *in = *p;
4211
4212 ++p;
4213 if (p.end())
4214 flags |= CHECK_CAPS_SYNCHRONOUS;
4215 check_caps(in, flags);
4216 }
4217}
4218
// (Re)send a cap-flush message to the auth MDS for every flush tid still
// pending on this inode.  If 'sync' is set, ask the MDS to flush its
// journal on the final (highest-tid) message only.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4240
// Block until every cap flush on this inode with tid <= 'want' has been
// acked by the MDS (i.e. removed from flushing_cap_tids).  Woken via
// in->waitfor_caps from handle_cap_flush_ack().
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    // oldest outstanding tid is already newer than what we need
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4254
// Block until no MDS session has an outstanding cap-flush tid <= 'want'.
// Restarts the scan from the top after every wakeup, since the session
// map and tid sets can change while we sleep on client_lock.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (s->flushing_caps_tids.empty())
	continue;
    // tids are kept in a sorted set, so begin() is the oldest in flight
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
4275
// After an MDS session is re-established, resend all pending snap and cap
// flushes for inodes on that session.  Inodes already handled by
// early_kick_flushing_caps() during reconnect are skipped, then the
// early-kick set is cleared for the next reconnect cycle.
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
4294
// During the reconnect stage, resend flushes only for inodes whose
// flushing caps were revoked while the session was down; the MDS must
// process those flush messages before it can issue the caps to another
// client.  Inodes handled here are recorded so kick_flushing_caps()
// does not resend them a second time.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4321
4322void Client::kick_maxsize_requests(MetaSession *session)
4323{
4324 xlist<Cap*>::iterator iter = session->caps.begin();
4325 while (!iter.end()){
4326 (*iter)->inode->requested_max_size = 0;
4327 (*iter)->inode->wanted_max_size = 0;
4328 signal_cond_list((*iter)->inode->waitfor_caps);
4329 ++iter;
4330 }
4331}
4332
// Rebuild this realm's cached SnapContext: the union of prior-parent
// snaps, the current parent's snaps created since we were re-parented
// (>= parent_since), and our own snaps.  seq is the max across ourselves
// and the parent; snap ids are stored in descending order, as the
// SnapContext convention requires.
void SnapRealm::build_snap_context()
{
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // start with prior_parents?
  for (unsigned i=0; i<prior_parent_snaps.size(); i++)
    snaps.insert(prior_parent_snaps[i]);

  // current parent's snaps
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (unsigned i=0; i<psnapc.snaps.size(); i++)
      if (psnapc.snaps[i] >= parent_since)
	snaps.insert(psnapc.snaps[i]);
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (unsigned i=0; i<my_snaps.size(); i++)
    snaps.insert(my_snaps[i]);

  // ok!
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.resize(0);
  cached_snap_context.snaps.reserve(snaps.size());
  // reverse iteration over the set yields descending snapid order
  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
    cached_snap_context.snaps.push_back(*p);
}
4363
4364void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4365{
4366 list<SnapRealm*> q;
4367 q.push_back(realm);
4368
4369 while (!q.empty()) {
4370 realm = q.front();
4371 q.pop_front();
4372
4373 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4374 realm->invalidate_cache();
4375
4376 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4377 p != realm->pchildren.end();
4378 ++p)
4379 q.push_back(*p);
4380 }
4381}
4382
4383SnapRealm *Client::get_snap_realm(inodeno_t r)
4384{
4385 SnapRealm *realm = snap_realms[r];
4386 if (!realm)
4387 snap_realms[r] = realm = new SnapRealm(r);
4388 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4389 realm->nref++;
4390 return realm;
4391}
4392
4393SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4394{
4395 if (snap_realms.count(r) == 0) {
4396 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4397 return NULL;
4398 }
4399 SnapRealm *realm = snap_realms[r];
4400 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4401 realm->nref++;
4402 return realm;
4403}
4404
// Drop a reference on 'realm'.  When the last reference goes away the
// realm is unlinked from the global map and from its parent (which in
// turn loses the reference the child held on it, possibly cascading),
// and then deleted.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4418
// Re-parent 'realm' under ino 'parent' if it moved: drop the link (and
// reference) to the old parent and take one on the new parent.  Returns
// true when the parent actually changed, so callers know to invalidate
// the cached snap contexts of the subtree.
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << "adjust_realm_parent " << *realm
		   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    // get_snap_realm takes the reference this child holds on its parent
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
4435
4436static bool has_new_snaps(const SnapContext& old_snapc,
4437 const SnapContext& new_snapc)
4438{
4439 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4440}
4441
4442
// Decode a snap trace (a sequence of SnapRealmInfo records) from the MDS
// and apply it to our realm tree.  If 'flush' is set, any inode with
// caps in a realm whose snap set changed gets its dirty data queued as a
// cap snap against the *old* snap context before the realm is updated.
// On return, *realm_ret (if non-NULL) holds a referenced pointer to the
// first realm in the trace; otherwise that reference is dropped here.
void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;

  // realms whose snap context changed, with their pre-update contexts
  map<SnapRealm*, SnapContext> dirty_realms;

  bufferlist::iterator p = bl.begin();
  while (!p.end()) {
    SnapRealmInfo info;
    ::decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
	       << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	// flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    // hold a ref for the dirty_realms entry; dropped in the loop below
	    realm->nref++;
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
	       << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }
        
    // keep the reference on the first realm for the caller; drop the rest
    if (!first_realm)
      first_realm = realm;
    else
      put_snap_realm(realm);
  }

  // queue cap snaps for inodes in realms that actually gained new snaps
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) { 
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
4535
// Handle an MClientSnap notification from an MDS.  For a SPLIT op,
// inodes listed in split_inos are moved (with their old snap contexts
// remembered) into the newly split realm, and child realms listed in
// split_realms are re-parented under it; then the attached snap trace is
// applied, and finally the moved inodes get cap snaps queued if the new
// realm brought new snaps.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes moving into the split realm -> their pre-move snap contexts
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);
    
    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move an inode out of a realm newer than the split target
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm " 
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // DESTROY carries stale snap info; don't flush against it
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4613
4614void Client::handle_quota(MClientQuota *m)
4615{
4616 mds_rank_t mds = mds_rank_t(m->get_source().num());
4617 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4618 if (!session) {
4619 m->put();
4620 return;
4621 }
4622
4623 got_mds_push(session);
4624
4625 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4626
4627 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4628 if (inode_map.count(vino)) {
4629 Inode *in = NULL;
4630 in = inode_map[vino];
4631
4632 if (in) {
4633 in->quota = m->quota;
4634 in->rstat = m->rstat;
4635 }
4636 }
4637
4638 m->put();
4639}
4640
// Top-level dispatcher for MClientCaps messages: applies any OSD epoch
// barrier, locates the inode, and routes to the per-op handler.  Each
// handler (or the default case) is responsible for m->put().
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      // an IMPORT for an inode we no longer cache: release it right away
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    // note: no return -- IMPORT falls through to the grant handling below
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4714
// Handle CEPH_CAP_OP_IMPORT: an MDS has imported our cap from a peer.
// Install/refresh the cap on the importing session (carrying over the
// perms from the peer cap if we still hold it), drop the peer cap, and,
// if we are now auth here, resend any pending snap/cap flushes.
// Note: the caller (handle_caps) still uses and frees 'm' afterwards.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);
  
  // remove the now-migrated peer cap (only if it is still the same cap)
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);
  
  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4755
// Handle CEPH_CAP_OP_EXPORT: the MDS is exporting our cap to a peer MDS.
// If the peer already sent its IMPORT we merge into (or create) the peer
// cap, transferring auth status and flush bookkeeping as needed; if no
// peer is named, the cap is simply dropped (flagged I_CAP_DROPPED when it
// was the auth cap).  In either case the exporting MDS's cap is removed.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // ignore a stale EXPORT whose cap_id no longer matches what we hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// only update if the peer info is newer than the cap we have
	if (tcap->cap_id != m->peer.cap_id ||
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4803
// Handle CEPH_CAP_OP_TRUNC: the MDS truncated the file; fold the new
// size/truncate/time metadata into the inode, honoring the caps we
// currently hold (issued | implemented | dirty) when deciding which
// fields to accept.
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
	   << " size " << in->size << " -> " << m->get_size()
	   << dendl;
  
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4822
// Handle CEPH_CAP_OP_FLUSH_ACK: the MDS acknowledged a cap flush.  Every
// tid <= the acked tid is retired; the cap bits covered by the acked tid
// (minus bits still pending in later tids) are cleared from
// flushing_caps.  Waiters and the global sync condition are signalled,
// and the inode ref taken in mark_caps_dirty is dropped once the inode
// is fully clean.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // this tid (and anything older) is implicitly acked; retire it
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // bits still in flight under a newer tid are not clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session has nothing older outstanding
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
	      << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref taken when the caps first went dirty
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
  
  m->put();
}
4882
4883
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS acknowledged a cap-snap
// flush for the snap following 'follows'.  Retire the matching CapSnap
// and its tid; duplicate/unknown acks are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4913
// Completion that invokes the client's dentry-invalidate callback from a
// finisher thread (outside client_lock).  It snapshots the dentry's
// directory ino, target ino (only when 'del' -- a real deletion -- so the
// kernel can also drop the inode), and name at construction time, since
// the Dentry may be freed before finish() runs.
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;
  vinodeno_t ino;
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();  // sentinel: no inode to drop
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4941
// Runs on the async invalidator thread: forward a dentry invalidation to
// the registered callback (e.g. the FUSE layer).  No-op once unmounting
// has begun.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4950
// Queue an async dentry invalidation, but only when a callback is
// registered and the target inode is visible to the kernel (ll_ref > 0).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
4956
// Try to release all cached references pinning 'in' (typically after the
// MDS reported nlink==0): expire its child dentries (recursing for
// snapshot subtrees), close its Dir, trim its snapdir, and -- when
// sched_inval is set and the kernel still references it -- schedule
// dcache invalidation and unlink its remaining dentries.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      //        so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
4999
// Handle CEPH_CAP_OP_GRANT / REVOKE (and the grant half of IMPORT):
// update inode metadata fields we don't hold EXCL caps for, apply the new
// cap mask, and on revocation flush buffered data / release cached data
// before acking via check_caps().  May trim the inode if its link count
// dropped to zero.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino() 
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps) 
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only accept MDS values for fields our caps don't make us authoritative on
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  if (old_caps & ~new_caps) { 
    ldout(cct, 10) << "  revocation of " << ccap_string(~new_caps & old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
        && !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5113
// Resolve the supplementary group list for (uid, gid).  Prefers the
// registered getgroups callback; otherwise falls back to getgrouplist(3)
// (when available), growing the buffer until it fits.  On success *sgids
// points to a malloc'd array the caller must free; returns the group
// count, or a negative errno on failure.
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again
      // (on failure getgrouplist stores the required count in sgid_count)
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5170
// Check whether 'perms' may access 'in' with the MAY_* bits in 'want'.
// Root (uid 0) is always allowed.  Non-owners consult the POSIX ACL
// first (when group bits are set on the mode); -EAGAIN from the ACL
// check falls through to the classic mode-bit check.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;
  
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5187
// Permission check for xattr access: refresh the inode's perm-relevant
// attrs, then restrict writes on "system." xattrs to root or the owner;
// all other names fall back to the normal inode permission check.
// Returns 0 on success or a negative errno.
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r <<  dendl;
  return r;
}
5206
5207ostream& operator<<(ostream &out, const UserPerm& perm) {
5208 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5209 return out;
5210}
5211
/*
 * May `perms` apply the attribute changes described by (stx, mask) to
 * inode `in`?  Mirrors kernel setattr_prepare() semantics.
 * Returns 0 if permitted, -EPERM/-EACCES (or a getattr error) otherwise.
 * Note: may clear S_ISGID in stx->stx_mode as a side effect (chmod rule).
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
  // refresh mode/ACL state before consulting it
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate/extend requires write permission on the file
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; the owner may only "chown" to self
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may switch group only to one of their supplementary groups
    // (or leave it unchanged); root may do anything
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      			   (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // like chmod(2): an unprivileged caller not in the file's (possibly
    // new) group cannot set the setgid bit — silently clear it
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only set times to "now" (utimes(NULL) semantics),
      // and even then need write permission; explicit timestamps are
      // owner/root only
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5268
5269int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5270{
5271 ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
5272 unsigned want = 0;
5273
5274 if ((flags & O_ACCMODE) == O_WRONLY)
5275 want = MAY_WRITE;
5276 else if ((flags & O_ACCMODE) == O_RDWR)
5277 want = MAY_READ | MAY_WRITE;
5278 else if ((flags & O_ACCMODE) == O_RDONLY)
5279 want = MAY_READ;
5280 if (flags & O_TRUNC)
5281 want |= MAY_WRITE;
5282
5283 int r = 0;
5284 switch (in->mode & S_IFMT) {
5285 case S_IFLNK:
5286 r = -ELOOP;
5287 goto out;
5288 case S_IFDIR:
5289 if (want & MAY_WRITE) {
5290 r = -EISDIR;
5291 goto out;
5292 }
5293 break;
5294 }
5295
5296 r = _getattr_for_perm(in, perms);
5297 if (r < 0)
5298 goto out;
5299
5300 r = inode_permission(in, perms, want);
5301out:
5302 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5303 return r;
5304}
5305
5306int Client::may_lookup(Inode *dir, const UserPerm& perms)
5307{
5308 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5309 int r = _getattr_for_perm(dir, perms);
5310 if (r < 0)
5311 goto out;
5312
5313 r = inode_permission(dir, perms, MAY_EXEC);
5314out:
5315 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5316 return r;
5317}
5318
5319int Client::may_create(Inode *dir, const UserPerm& perms)
5320{
5321 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5322 int r = _getattr_for_perm(dir, perms);
5323 if (r < 0)
5324 goto out;
5325
5326 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5327out:
5328 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5329 return r;
5330}
5331
5332int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5333{
5334 ldout(cct, 20) << __func__ << *dir << "; " << "; name " << name << "; " << perms << dendl;
5335 int r = _getattr_for_perm(dir, perms);
5336 if (r < 0)
5337 goto out;
5338
5339 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5340 if (r < 0)
5341 goto out;
5342
5343 /* 'name == NULL' means rmsnap */
5344 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5345 InodeRef otherin;
5346 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5347 if (r < 0)
5348 goto out;
5349 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5350 r = -EPERM;
5351 }
5352out:
5353 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5354 return r;
5355}
5356
5357int Client::may_hardlink(Inode *in, const UserPerm& perms)
5358{
5359 ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
5360 int r = _getattr_for_perm(in, perms);
5361 if (r < 0)
5362 goto out;
5363
5364 if (perms.uid() == 0 || perms.uid() == in->uid) {
5365 r = 0;
5366 goto out;
5367 }
5368
5369 r = -EPERM;
5370 if (!S_ISREG(in->mode))
5371 goto out;
5372
5373 if (in->mode & S_ISUID)
5374 goto out;
5375
5376 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5377 goto out;
5378
5379 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5380out:
5381 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5382 return r;
5383}
5384
5385int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5386{
5387 int mask = CEPH_STAT_CAP_MODE;
5388 bool force = false;
5389 if (acl_type != NO_ACL) {
5390 mask |= CEPH_STAT_CAP_XATTR;
5391 force = in->xattr_version == 0;
5392 }
5393 return _getattr(in, mask, perms, force);
5394}
5395
5396vinodeno_t Client::_get_vino(Inode *in)
5397{
5398 /* The caller must hold the client lock */
5399 return vinodeno_t(in->ino, in->snapid);
5400}
5401
5402inodeno_t Client::_get_inodeno(Inode *in)
5403{
5404 /* The caller must hold the client lock */
5405 return in->ino;
5406}
5407
5408
5409/**
5410 * Resolve an MDS spec to a list of MDS daemon GIDs.
5411 *
5412 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5413 * It may be '*' in which case it matches all GIDs.
5414 *
5415 * If no error is returned, the `targets` vector will be populated with at least
5416 * one MDS.
5417 */
5418int Client::resolve_mds(
5419 const std::string &mds_spec,
5420 std::vector<mds_gid_t> *targets)
5421{
5422 assert(fsmap);
5423 assert(targets != nullptr);
5424
5425 mds_role_t role;
5426 std::stringstream ss;
5427 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5428 if (role_r == 0) {
5429 // We got a role, resolve it to a GID
5430 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5431 << role << "'" << dendl;
5432 targets->push_back(
5433 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5434 return 0;
5435 }
5436
5437 std::string strtol_err;
5438 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5439 if (strtol_err.empty()) {
5440 // It is a possible GID
5441 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5442 if (fsmap->gid_exists(mds_gid)) {
5443 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5444 targets->push_back(mds_gid);
5445 } else {
5446 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5447 << dendl;
5448 return -ENOENT;
5449 }
5450 } else if (mds_spec == "*") {
5451 // It is a wildcard: use all MDSs
5452 const auto mds_info = fsmap->get_mds_info();
5453
5454 if (mds_info.empty()) {
5455 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5456 return -ENOENT;
5457 }
5458
5459 for (const auto i : mds_info) {
5460 targets->push_back(i.first);
5461 }
5462 } else {
5463 // It did not parse as an integer, it is not a wildcard, it must be a name
5464 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5465 if (mds_gid == 0) {
5466 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5467
5468 lderr(cct) << "FSMap: " << *fsmap << dendl;
5469
5470 return -ENOENT;
5471 } else {
5472 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5473 << "' to GID " << mds_gid << dendl;
5474 targets->push_back(mds_gid);
5475 }
5476 }
5477
5478 return 0;
5479}
5480
5481
5482/**
5483 * Authenticate with mon and establish global ID
5484 */
5485int Client::authenticate()
5486{
5487 assert(client_lock.is_locked_by_me());
5488
5489 if (monclient->is_authenticated()) {
5490 return 0;
5491 }
5492
5493 client_lock.Unlock();
5494 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5495 client_lock.Lock();
5496 if (r < 0) {
5497 return r;
5498 }
5499
5500 whoami = monclient->get_global_id();
5501 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5502
5503 return 0;
5504}
5505
/**
 * Fetch the latest FSMap (user == false) or FSMapUser (user == true)
 * from the monitors into fsmap / fsmap_user respectively.
 *
 * Called with client_lock held; the lock is dropped and re-taken while
 * waiting on the monitor.
 *
 * @return 0 on success, negative error code on failure.
 */
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();   // don't hold the client lock while waiting
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);   // monitor may ask us to retry

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // one-shot subscription until our cached map catches up to the
    // version we just learned
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5549
5550/**
5551 *
5552 * @mds_spec one of ID, rank, GID, "*"
5553 *
5554 */
5555int Client::mds_command(
5556 const std::string &mds_spec,
5557 const vector<string>& cmd,
5558 const bufferlist& inbl,
5559 bufferlist *outbl,
5560 string *outs,
5561 Context *onfinish)
5562{
5563 Mutex::Locker lock(client_lock);
5564
5565 assert(initialized);
5566
5567 int r;
5568 r = authenticate();
5569 if (r < 0) {
5570 return r;
5571 }
5572
5573 r = fetch_fsmap(false);
5574 if (r < 0) {
5575 return r;
5576 }
5577
5578 // Look up MDS target(s) of the command
5579 std::vector<mds_gid_t> targets;
5580 r = resolve_mds(mds_spec, &targets);
5581 if (r < 0) {
5582 return r;
5583 }
5584
5585 // If daemons are laggy, we won't send them commands. If all
5586 // are laggy then we fail.
5587 std::vector<mds_gid_t> non_laggy;
5588 for (const auto gid : targets) {
5589 const auto info = fsmap->get_info_gid(gid);
5590 if (!info.laggy()) {
5591 non_laggy.push_back(gid);
5592 }
5593 }
5594 if (non_laggy.size() == 0) {
5595 *outs = "All targeted MDS daemons are laggy";
5596 return -ENOENT;
5597 }
5598
5599 if (metadata.empty()) {
5600 // We are called on an unmounted client, so metadata
5601 // won't be initialized yet.
5602 populate_metadata("");
5603 }
5604
5605 // Send commands to targets
5606 C_GatherBuilder gather(cct, onfinish);
5607 for (const auto target_gid : non_laggy) {
5608 const auto info = fsmap->get_info_gid(target_gid);
5609
5610 // Open a connection to the target MDS
5611 entity_inst_t inst = info.get_inst();
5612 ConnectionRef conn = messenger->get_connection(inst);
5613
5614 // Generate MDSCommandOp state
5615 auto &op = command_table.start_command();
5616
5617 op.on_finish = gather.new_sub();
5618 op.cmd = cmd;
5619 op.outbl = outbl;
5620 op.outs = outs;
5621 op.inbl = inbl;
5622 op.mds_gid = target_gid;
5623 op.con = conn;
5624
5625 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5626 << " tid=" << op.tid << cmd << dendl;
5627
5628 // Construct and send MCommand
5629 MCommand *m = op.get_message(monclient->get_fsid());
5630 conn->send_message(m);
5631 }
5632 gather.activate();
5633
5634 return 0;
5635}
5636
5637void Client::handle_command_reply(MCommandReply *m)
5638{
5639 ceph_tid_t const tid = m->get_tid();
5640
5641 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5642
5643 if (!command_table.exists(tid)) {
5644 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5645 m->put();
5646 return;
5647 }
5648
5649 auto &op = command_table.get_command(tid);
5650 if (op.outbl) {
5651 op.outbl->claim(m->get_data());
5652 }
5653 if (op.outs) {
5654 *op.outs = m->rs;
5655 }
5656
5657 if (op.on_finish) {
5658 op.on_finish->complete(m->r);
5659 }
5660
5661 command_table.erase(tid);
5662
5663 m->put();
5664}
5665
5666// -------------------
5667// MOUNT
5668
/**
 * Mount the filesystem: authenticate, subscribe to the appropriate MDS
 * map, optionally wait for the MDS cluster to become available, then
 * getattr the mount root (and its ancestors) and pin the root inode.
 *
 * @param mount_root  path within the fs to use as our root ("" => "/")
 * @param perms       credentials for the getattr path walk
 * @param require_mds if true, fail with CEPH_FUSE_NO_MDS_UP when the MDS
 *                    cluster is stuck unavailable instead of proceeding
 * @return 0 on success, negative error code on failure.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: map its name to a cluster id
    // and subscribe to that filesystem's map ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point, then walk up one ancestor at a time to the
  // root; EACCES on an ancestor is tolerated (but quotas may not work)
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);   // pin the root inode for the low-level API

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5779
5780// UNMOUNT
5781
5782void Client::_close_sessions()
5783{
5784 while (!mds_sessions.empty()) {
5785 // send session closes!
5786 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5787 p != mds_sessions.end();
5788 ++p) {
5789 if (p->second->state != MetaSession::STATE_CLOSING) {
5790 _close_mds_session(p->second);
5791 }
5792 }
5793
5794 // wait for sessions to close
5795 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5796 mount_cond.Wait(client_lock);
5797 }
5798}
5799
31f18b77
FG
5800void Client::flush_mdlog_sync()
5801{
5802 if (mds_requests.empty())
5803 return;
5804 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5805 p != mds_sessions.end();
5806 ++p) {
5807 MetaSession *s = p->second;
5808 flush_mdlog(s);
5809 }
5810}
5811
5812void Client::flush_mdlog(MetaSession *session)
5813{
5814 // Only send this to Luminous or newer MDS daemons, older daemons
5815 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5816 const uint64_t features = session->con->get_features();
5817 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5818 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5819 session->con->send_message(m);
5820 }
5821}
5822
5823
7c673cae
FG
/**
 * Unmount the client: drain in-flight MDS requests, tear down any files
 * and directories left open, flush dirty data and caps, empty the inode
 * cache, and close all MDS sessions.  Blocks until fully quiesced.
 * If we have been blacklisted, skip the clean flush (it could hang) and
 * just purge cached data.
 */
void Client::unmount()
{
  Mutex::Locker lock(client_lock);

  assert(mounted);  // caller is confused?

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for all outstanding MDS requests to complete
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and low-level (ll_*) handles that were never closed
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for unsafe writes to be acknowledged
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate via a saved 'next' since _release/_flush may invalidate
    // the current entry)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	InodeRef tmp_ref(in);   // keep the inode alive across the flush
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  lru.lru_set_max(0);
  trim_cache();

  // wait for the cache to fully drain; log its state every 5s while stuck
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);   // aid debugging of stuck unmounts
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5946
5947
5948
// Timer event used by Client::tick() to re-arm itself periodically.
class C_C_Tick : public Context {
  Client *client;   // not owned; tick_event is cancelled in unmount()
public:
  explicit C_C_Tick(Client *c) : client(c) {}
  void finish(int r) override {
    // Called back via Timer, which takes client_lock for us
    assert(client->client_lock.is_locked_by_me());
    client->tick();
  }
};
5959
5960void Client::flush_cap_releases()
5961{
5962 // send any cap releases
5963 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5964 p != mds_sessions.end();
5965 ++p) {
5966 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5967 p->first)) {
5968 if (cct->_conf->client_inject_release_failure) {
5969 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5970 p->second->release->put();
5971 } else {
5972 p->second->con->send_message(p->second->release);
5973 }
5974 p->second->release = 0;
5975 }
5976 }
5977}
5978
/**
 * Periodic housekeeping, re-armed every client_tick_interval seconds:
 * times out pre-mount MDS requests, renews caps, sends queued cap
 * releases, re-checks delayed caps, and trims the cache.
 * Runs with client_lock held (via C_C_Tick, or directly from mount()).
 */
void Client::tick()
{
  // test hook: artificially delay this tick, then reset the knob
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm the timer for the next tick
  tick_event = new C_C_Tick(this);
  timer.add_event_after(cct->_conf->client_tick_interval, tick_event);

  utime_t now = ceph_clock_now();

  // not mounted yet: abort the oldest request once the mount timeout
  // has elapsed, and wake everything that may be blocked on it
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: re-run check_caps on inodes whose hold period expired
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;   // first unexpired entry; stop (presumably time-ordered)
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6032
6033void Client::renew_caps()
6034{
6035 ldout(cct, 10) << "renew_caps()" << dendl;
6036 last_cap_renew = ceph_clock_now();
6037
6038 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6039 p != mds_sessions.end();
6040 ++p) {
6041 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6042 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6043 renew_caps(p->second);
6044 }
6045}
6046
6047void Client::renew_caps(MetaSession *session)
6048{
6049 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6050 session->last_cap_renew_request = ceph_clock_now();
6051 uint64_t seq = ++session->cap_renew_seq;
6052 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6053}
6054
6055
6056// ===============================================================
6057// high level (POSIXy) interface
6058
6059int Client::_do_lookup(Inode *dir, const string& name, int mask,
6060 InodeRef *target, const UserPerm& perms)
6061{
6062 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6063 MetaRequest *req = new MetaRequest(op);
6064 filepath path;
6065 dir->make_nosnap_relative_path(path);
6066 path.push_dentry(name);
6067 req->set_filepath(path);
6068 req->set_inode(dir);
6069 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6070 mask |= DEBUG_GETATTR_CAPS;
6071 req->head.args.getattr.mask = mask;
6072
6073 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6074
6075 int r = make_request(req, perms, target);
6076 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6077 return r;
6078}
6079
/**
 * Look up `dname` in directory `dir`, using cached dentries when their
 * dentry lease (or the directory's CEPH_CAP_FILE_SHARED cap) is still
 * valid, and falling back to an MDS lookup (_do_lookup) otherwise.
 *
 * @param mask  caps the cached inode must have issued for a cache hit
 * @return 0 with *target set, or a negative error code
 *         (-ENOTDIR, -ENAMETOOLONG, -ENOENT, or an MDS error).
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;   // no parent known: ".." of our root is itself
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // snapshot pseudo-directory (name comes from the client_snapdir conf)
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only trustworthy while the session's cap
	// generation matches and its cap lease hasn't expired
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask)))
	      goto hit_dn;
	// complete directory contents cached and no inode for this name:
	// the entry definitely does not exist
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or stale lease: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;   // valid cached *negative* dentry
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6181
6182int Client::get_or_create(Inode *dir, const char* name,
6183 Dentry **pdn, bool expect_null)
6184{
6185 // lookup
6186 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6187 dir->open_dir();
6188 if (dir->dir->dentries.count(name)) {
6189 Dentry *dn = dir->dir->dentries[name];
6190
6191 // is dn lease valid?
6192 utime_t now = ceph_clock_now();
6193 if (dn->inode &&
6194 dn->lease_mds >= 0 &&
6195 dn->lease_ttl > now &&
6196 mds_sessions.count(dn->lease_mds)) {
6197 MetaSession *s = mds_sessions[dn->lease_mds];
6198 if (s->cap_ttl > now &&
6199 s->cap_gen == dn->lease_gen) {
6200 if (expect_null)
6201 return -EEXIST;
6202 }
6203 }
6204 *pdn = dn;
6205 } else {
6206 // otherwise link up a new one
6207 *pdn = link(dir->dir, name, NULL, NULL);
6208 }
6209
6210 // success
6211 return 0;
6212}
6213
/**
 * Walk `origpath` component by component, starting from the root
 * (absolute path) or cwd (relative), resolving symlinks along the way.
 *
 * @param end        out: final inode (may be NULL if the caller only
 *                   needs the existence/permission check)
 * @param perms      credentials for per-component may_lookup() checks
 * @param followsym  whether to follow a symlink in the *final*
 *                   component; intermediate symlinks are always followed
 * @param mask       extra caps to request on the last component
 * @return 0 on success; negative error otherwise (-ELOOP after more
 *         than MAXSYMLINKS expansions, -ENOENT, permission errors, ...)
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;   // total symlink expansions so far

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on every directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;   // absolute target: restart the walk from root
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6295
6296
6297// namespace ops
6298
6299int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6300{
6301 Mutex::Locker lock(client_lock);
6302 tout(cct) << "link" << std::endl;
6303 tout(cct) << relexisting << std::endl;
6304 tout(cct) << relpath << std::endl;
6305
6306 filepath existing(relexisting);
6307
6308 InodeRef in, dir;
6309 int r = path_walk(existing, &in, perm, true);
6310 if (r < 0)
6311 return r;
6312 if (std::string(relpath) == "/") {
6313 r = -EEXIST;
6314 return r;
6315 }
6316 filepath path(relpath);
6317 string name = path.last_dentry();
6318 path.pop_dentry();
6319
6320 r = path_walk(path, &dir, perm, true);
6321 if (r < 0)
6322 return r;
6323 if (cct->_conf->client_permissions) {
6324 if (S_ISDIR(in->mode)) {
6325 r = -EPERM;
6326 return r;
6327 }
6328 r = may_hardlink(in.get(), perm);
6329 if (r < 0)
6330 return r;
6331 r = may_create(dir.get(), perm);
6332 if (r < 0)
6333 return r;
6334 }
6335 r = _link(in.get(), dir.get(), name.c_str(), perm);
6336 return r;
6337}
6338
6339int Client::unlink(const char *relpath, const UserPerm& perm)
6340{
6341 Mutex::Locker lock(client_lock);
6342 tout(cct) << "unlink" << std::endl;
6343 tout(cct) << relpath << std::endl;
6344
6345 if (std::string(relpath) == "/")
6346 return -EISDIR;
6347
6348 filepath path(relpath);
6349 string name = path.last_dentry();
6350 path.pop_dentry();
6351 InodeRef dir;
6352 int r = path_walk(path, &dir, perm);
6353 if (r < 0)
6354 return r;
6355 if (cct->_conf->client_permissions) {
6356 r = may_delete(dir.get(), name.c_str(), perm);
6357 if (r < 0)
6358 return r;
6359 }
6360 return _unlink(dir.get(), name.c_str(), perm);
6361}
6362
6363int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6364{
6365 Mutex::Locker lock(client_lock);
6366 tout(cct) << "rename" << std::endl;
6367 tout(cct) << relfrom << std::endl;
6368 tout(cct) << relto << std::endl;
6369
6370 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6371 return -EBUSY;
6372
6373 filepath from(relfrom);
6374 filepath to(relto);
6375 string fromname = from.last_dentry();
6376 from.pop_dentry();
6377 string toname = to.last_dentry();
6378 to.pop_dentry();
6379
6380 InodeRef fromdir, todir;
6381 int r = path_walk(from, &fromdir, perm);
6382 if (r < 0)
6383 goto out;
6384 r = path_walk(to, &todir, perm);
6385 if (r < 0)
6386 goto out;
6387
6388 if (cct->_conf->client_permissions) {
6389 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6390 if (r < 0)
6391 return r;
6392 r = may_delete(todir.get(), toname.c_str(), perm);
6393 if (r < 0 && r != -ENOENT)
6394 return r;
6395 }
6396 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6397out:
6398 return r;
6399}
6400
6401// dirs
6402
6403int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6404{
6405 Mutex::Locker lock(client_lock);
6406 tout(cct) << "mkdir" << std::endl;
6407 tout(cct) << relpath << std::endl;
6408 tout(cct) << mode << std::endl;
6409 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6410
6411 if (std::string(relpath) == "/")
6412 return -EEXIST;
6413
6414 filepath path(relpath);
6415 string name = path.last_dentry();
6416 path.pop_dentry();
6417 InodeRef dir;
6418 int r = path_walk(path, &dir, perm);
6419 if (r < 0)
6420 return r;
6421 if (cct->_conf->client_permissions) {
6422 r = may_create(dir.get(), perm);
6423 if (r < 0)
6424 return r;
6425 }
6426 return _mkdir(dir.get(), name.c_str(), mode, perm);
6427}
6428
6429int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6430{
6431 Mutex::Locker lock(client_lock);
6432 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6433 tout(cct) << "mkdirs" << std::endl;
6434 tout(cct) << relpath << std::endl;
6435 tout(cct) << mode << std::endl;
6436
6437 //get through existing parts of path
6438 filepath path(relpath);
6439 unsigned int i;
6440 int r = 0, caps = 0;
6441 InodeRef cur, next;
6442 cur = cwd;
6443 for (i=0; i<path.depth(); ++i) {
6444 if (cct->_conf->client_permissions) {
6445 r = may_lookup(cur.get(), perms);
6446 if (r < 0)
6447 break;
6448 caps = CEPH_CAP_AUTH_SHARED;
6449 }
6450 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6451 if (r < 0)
6452 break;
6453 cur.swap(next);
6454 }
6455 //check that we have work left to do
6456 if (i==path.depth()) return -EEXIST;
6457 if (r!=-ENOENT) return r;
6458 ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
6459 //make new directory at each level
6460 for (; i<path.depth(); ++i) {
6461 if (cct->_conf->client_permissions) {
6462 r = may_create(cur.get(), perms);
6463 if (r < 0)
6464 return r;
6465 }
6466 //make new dir
6467 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
c07f9fc5 6468
7c673cae 6469 //check proper creation/existence
c07f9fc5
FG
6470 if(-EEXIST == r && i < path.depth() - 1) {
6471 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6472 }
6473 if (r < 0)
6474 return r;
7c673cae
FG
6475 //move to new dir and continue
6476 cur.swap(next);
6477 ldout(cct, 20) << "mkdirs: successfully created directory "
6478 << filepath(cur->ino).get_path() << dendl;
6479 }
6480 return 0;
6481}
6482
6483int Client::rmdir(const char *relpath, const UserPerm& perms)
6484{
6485 Mutex::Locker lock(client_lock);
6486 tout(cct) << "rmdir" << std::endl;
6487 tout(cct) << relpath << std::endl;
6488
6489 if (std::string(relpath) == "/")
6490 return -EBUSY;
6491
6492 filepath path(relpath);
6493 string name = path.last_dentry();
6494 path.pop_dentry();
6495 InodeRef dir;
6496 int r = path_walk(path, &dir, perms);
6497 if (r < 0)
6498 return r;
6499 if (cct->_conf->client_permissions) {
6500 int r = may_delete(dir.get(), name.c_str(), perms);
6501 if (r < 0)
6502 return r;
6503 }
6504 return _rmdir(dir.get(), name.c_str(), perms);
6505}
6506
6507int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6508{
6509 Mutex::Locker lock(client_lock);
6510 tout(cct) << "mknod" << std::endl;
6511 tout(cct) << relpath << std::endl;
6512 tout(cct) << mode << std::endl;
6513 tout(cct) << rdev << std::endl;
6514
6515 if (std::string(relpath) == "/")
6516 return -EEXIST;
6517
6518 filepath path(relpath);
6519 string name = path.last_dentry();
6520 path.pop_dentry();
6521 InodeRef dir;
6522 int r = path_walk(path, &dir, perms);
6523 if (r < 0)
6524 return r;
6525 if (cct->_conf->client_permissions) {
6526 int r = may_create(dir.get(), perms);
6527 if (r < 0)
6528 return r;
6529 }
6530 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6531}
6532
6533// symlinks
6534
6535int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6536{
6537 Mutex::Locker lock(client_lock);
6538 tout(cct) << "symlink" << std::endl;
6539 tout(cct) << target << std::endl;
6540 tout(cct) << relpath << std::endl;
6541
6542 if (std::string(relpath) == "/")
6543 return -EEXIST;
6544
6545 filepath path(relpath);
6546 string name = path.last_dentry();
6547 path.pop_dentry();
6548 InodeRef dir;
6549 int r = path_walk(path, &dir, perms);
6550 if (r < 0)
6551 return r;
6552 if (cct->_conf->client_permissions) {
6553 int r = may_create(dir.get(), perms);
6554 if (r < 0)
6555 return r;
6556 }
6557 return _symlink(dir.get(), name.c_str(), target, perms);
6558}
6559
6560int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6561{
6562 Mutex::Locker lock(client_lock);
6563 tout(cct) << "readlink" << std::endl;
6564 tout(cct) << relpath << std::endl;
6565
6566 filepath path(relpath);
6567 InodeRef in;
6568 int r = path_walk(path, &in, perms, false);
6569 if (r < 0)
6570 return r;
6571
6572 return _readlink(in.get(), buf, size);
6573}
6574
6575int Client::_readlink(Inode *in, char *buf, size_t size)
6576{
6577 if (!in->is_symlink())
6578 return -EINVAL;
6579
6580 // copy into buf (at most size bytes)
6581 int r = in->symlink.length();
6582 if (r > (int)size)
6583 r = size;
6584 memcpy(buf, in->symlink.c_str(), r);
6585 return r;
6586}
6587
6588
6589// inode stuff
6590
6591int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6592{
6593 bool yes = in->caps_issued_mask(mask);
6594
6595 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6596 if (yes && !force)
6597 return 0;
6598
6599 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6600 filepath path;
6601 in->make_nosnap_relative_path(path);
6602 req->set_filepath(path);
6603 req->set_inode(in);
6604 req->head.args.getattr.mask = mask;
6605
6606 int res = make_request(req, perms);
6607 ldout(cct, 10) << "_getattr result=" << res << dendl;
6608 return res;
6609}
6610
// Apply the setattr described by 'stx'/'mask' to inode 'in'.  Fields that
// are covered by exclusive caps we hold are changed locally (and the caps
// marked dirty); any bits left in 'mask' afterwards are sent to the MDS as
// a SETATTR request.  Returns 0 or a negative errno.
6611 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6612 const UserPerm& perms, InodeRef *inp)
6613 {
6614 int issued = in->caps_issued();
6615
6616 ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
6617 ccap_string(issued) << dendl;
6618
// Snapshots are read-only.
6619 if (in->snapid != CEPH_NOSNAP) {
6620 return -EROFS;
6621 }
// A size increase may push the tree over its quota; reject it up front.
6622 if ((mask & CEPH_SETATTR_SIZE) &&
6623 (unsigned long)stx->stx_size > in->size &&
6624 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6625 perms)) {
6626 return -EDQUOT;
6627 }
6628
6629 // make the change locally?
6630 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6631 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6632 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6633 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6634 << in->cap_dirtier_gid << ", forcing sync setattr"
6635 << dendl;
6636 /*
6637 * This works because we implicitly flush the caps as part of the
6638 * request, so the cap update check will happen with the writeback
6639 * cap context, and then the setattr check will happen with the
6640 * caller's context.
6641 *
6642 * In reality this pattern is likely pretty rare (different users
6643 * setattr'ing the same file). If that turns out not to be the
6644 * case later, we can build a more complex pipelined cap writeback
6645 * infrastructure...
6646 */
6647 if (!mask)
6648 mask |= CEPH_SETATTR_CTIME;
6649 goto force_request;
6650 }
6651
6652 if (!mask) {
6653 // caller just needs us to bump the ctime
6654 in->ctime = ceph_clock_now();
6655 in->cap_dirtier_uid = perms.uid();
6656 in->cap_dirtier_gid = perms.gid();
6657 if (issued & CEPH_CAP_AUTH_EXCL)
6658 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6659 else if (issued & CEPH_CAP_FILE_EXCL)
6660 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6661 else if (issued & CEPH_CAP_XATTR_EXCL)
6662 mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
6663 else
// No exclusive caps at all: fall through to a synchronous MDS request.
6664 mask |= CEPH_SETATTR_CTIME;
6665 }
6666
// With Ax (auth exclusive) we can change ownership/mode/btime locally.
// Each applied bit is cleared from 'mask' so it is not re-sent to the MDS.
6667 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6668 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6669
6670 mask &= ~CEPH_SETATTR_KILL_SGUID;
6671
6672 if (mask & CEPH_SETATTR_UID) {
6673 in->ctime = ceph_clock_now();
6674 in->cap_dirtier_uid = perms.uid();
6675 in->cap_dirtier_gid = perms.gid();
6676 in->uid = stx->stx_uid;
6677 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6678 mask &= ~CEPH_SETATTR_UID;
6679 kill_sguid = true;
6680 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6681 }
6682 if (mask & CEPH_SETATTR_GID) {
6683 in->ctime = ceph_clock_now();
6684 in->cap_dirtier_uid = perms.uid();
6685 in->cap_dirtier_gid = perms.gid();
6686 in->gid = stx->stx_gid;
6687 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6688 mask &= ~CEPH_SETATTR_GID;
6689 kill_sguid = true;
6690 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6691 }
6692
6693 if (mask & CEPH_SETATTR_MODE) {
6694 in->ctime = ceph_clock_now();
6695 in->cap_dirtier_uid = perms.uid();
6696 in->cap_dirtier_gid = perms.gid();
6697 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
6698 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6699 mask &= ~CEPH_SETATTR_MODE;
6700 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6701 } else if (kill_sguid && S_ISREG(in->mode)) {
6702 /* Must squash the any setuid/setgid bits with an ownership change */
6703 in->mode &= ~S_ISUID;
6704 if ((in->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
6705 in->mode &= ~S_ISGID;
6706 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6707 }
6708
6709 if (mask & CEPH_SETATTR_BTIME) {
6710 in->ctime = ceph_clock_now();
6711 in->cap_dirtier_uid = perms.uid();
6712 in->cap_dirtier_gid = perms.gid();
6713 in->btime = utime_t(stx->stx_btime);
6714 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6715 mask &= ~CEPH_SETATTR_BTIME;
6716 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6717 }
6718 } else if (mask & CEPH_SETATTR_SIZE) {
6719 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6720 mask |= CEPH_SETATTR_KILL_SGUID;
6721 }
6722
// With Fx (file exclusive) we can change mtime/atime locally.
6723 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6724 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6725 if (mask & CEPH_SETATTR_MTIME)
6726 in->mtime = utime_t(stx->stx_mtime);
6727 if (mask & CEPH_SETATTR_ATIME)
6728 in->atime = utime_t(stx->stx_atime);
6729 in->ctime = ceph_clock_now();
6730 in->cap_dirtier_uid = perms.uid();
6731 in->cap_dirtier_gid = perms.gid();
6732 in->time_warp_seq++;
6733 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6734 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6735 }
6736 }
// Everything was satisfied locally; no MDS round trip needed.
6737 if (!mask) {
6738 in->change_attr++;
6739 return 0;
6740 }
6741
6742force_request:
6743 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6744
6745 filepath path;
6746
6747 in->make_nosnap_relative_path(path);
6748 req->set_filepath(path);
6749 req->set_inode(in);
6750
// For each remaining bit, fill the request args and drop the caps whose
// cached state the change invalidates.
6751 if (mask & CEPH_SETATTR_KILL_SGUID) {
6752 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6753 }
6754 if (mask & CEPH_SETATTR_MODE) {
6755 req->head.args.setattr.mode = stx->stx_mode;
6756 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6757 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6758 }
6759 if (mask & CEPH_SETATTR_UID) {
6760 req->head.args.setattr.uid = stx->stx_uid;
6761 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6762 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6763 }
6764 if (mask & CEPH_SETATTR_GID) {
6765 req->head.args.setattr.gid = stx->stx_gid;
6766 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6767 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6768 }
6769 if (mask & CEPH_SETATTR_BTIME) {
6770 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6771 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6772 }
6773 if (mask & CEPH_SETATTR_MTIME) {
6774 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
6775 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6776 CEPH_CAP_FILE_WR;
6777 }
6778 if (mask & CEPH_SETATTR_ATIME) {
6779 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6780 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6781 CEPH_CAP_FILE_WR;
6782 }
6783 if (mask & CEPH_SETATTR_SIZE) {
// Refuse truncates beyond the cluster-wide max file size.
6784 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6785 req->head.args.setattr.size = stx->stx_size;
6786 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6787 } else { //too big!
6788 put_request(req);
6789 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
6790 return -EFBIG;
6791 }
6792 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6793 CEPH_CAP_FILE_WR;
6794 }
6795 req->head.args.setattr.mask = mask;
6796
6797 req->regetattr_mask = mask;
6798
6799 int res = make_request(req, perms, inp);
6800 ldout(cct, 10) << "_setattr result=" << res << dendl;
6801 return res;
6802 }
6803
6804/* Note that we only care about attrs that setattr cares about */
6805void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6806{
6807 stx->stx_size = st->st_size;
6808 stx->stx_mode = st->st_mode;
6809 stx->stx_uid = st->st_uid;
6810 stx->stx_gid = st->st_gid;
6811 stx->stx_mtime = st->st_mtim;
6812 stx->stx_atime = st->st_atim;
6813}
6814
6815int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6816 const UserPerm& perms, InodeRef *inp)
6817{
6818 int ret = _do_setattr(in, stx, mask, perms, inp);
6819 if (ret < 0)
6820 return ret;
6821 if (mask & CEPH_SETATTR_MODE)
6822 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6823 return ret;
6824}
6825
6826int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6827 const UserPerm& perms)
6828{
6829 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6830 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6831 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6832 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6833 if (cct->_conf->client_permissions) {
6834 int r = may_setattr(in.get(), stx, mask, perms);
6835 if (r < 0)
6836 return r;
6837 }
6838 return __setattrx(in.get(), stx, mask, perms);
6839}
6840
6841int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6842 const UserPerm& perms)
6843{
6844 struct ceph_statx stx;
6845
6846 stat_to_statx(attr, &stx);
6847 mask &= ~CEPH_SETATTR_BTIME;
6848 return _setattrx(in, &stx, mask, perms);
6849}
6850
6851int Client::setattr(const char *relpath, struct stat *attr, int mask,
6852 const UserPerm& perms)
6853{
6854 Mutex::Locker lock(client_lock);
6855 tout(cct) << "setattr" << std::endl;
6856 tout(cct) << relpath << std::endl;
6857 tout(cct) << mask << std::endl;
6858
6859 filepath path(relpath);
6860 InodeRef in;
6861 int r = path_walk(path, &in, perms);
6862 if (r < 0)
6863 return r;
6864 return _setattr(in, attr, mask, perms);
6865}
6866
6867int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6868 const UserPerm& perms, int flags)
6869{
6870 Mutex::Locker lock(client_lock);
6871 tout(cct) << "setattrx" << std::endl;
6872 tout(cct) << relpath << std::endl;
6873 tout(cct) << mask << std::endl;
6874
6875 filepath path(relpath);
6876 InodeRef in;
6877 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6878 if (r < 0)
6879 return r;
6880 return _setattrx(in, stx, mask, perms);
6881}
6882
6883int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6884{
6885 Mutex::Locker lock(client_lock);
6886 tout(cct) << "fsetattr" << std::endl;
6887 tout(cct) << fd << std::endl;
6888 tout(cct) << mask << std::endl;
6889
6890 Fh *f = get_filehandle(fd);
6891 if (!f)
6892 return -EBADF;
6893#if defined(__linux__) && defined(O_PATH)
6894 if (f->flags & O_PATH)
6895 return -EBADF;
6896#endif
6897 return _setattr(f->inode, attr, mask, perms);
6898}
6899
6900int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6901{
6902 Mutex::Locker lock(client_lock);
6903 tout(cct) << "fsetattr" << std::endl;
6904 tout(cct) << fd << std::endl;
6905 tout(cct) << mask << std::endl;
6906
6907 Fh *f = get_filehandle(fd);
6908 if (!f)
6909 return -EBADF;
6910#if defined(__linux__) && defined(O_PATH)
6911 if (f->flags & O_PATH)
6912 return -EBADF;
6913#endif
6914 return _setattrx(f->inode, stx, mask, perms);
6915}
6916
6917int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6918 frag_info_t *dirstat, int mask)
6919{
6920 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6921 Mutex::Locker lock(client_lock);
6922 tout(cct) << "stat" << std::endl;
6923 tout(cct) << relpath << std::endl;
6924 filepath path(relpath);
6925 InodeRef in;
6926 int r = path_walk(path, &in, perms, true, mask);
6927 if (r < 0)
6928 return r;
6929 r = _getattr(in, mask, perms);
6930 if (r < 0) {
6931 ldout(cct, 3) << "stat exit on error!" << dendl;
6932 return r;
6933 }
6934 fill_stat(in, stbuf, dirstat);
6935 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6936 return r;
6937}
6938
6939unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6940{
6941 unsigned mask = 0;
6942
6943 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6944 if (flags & AT_NO_ATTR_SYNC)
6945 goto out;
6946
6947 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6948 mask |= CEPH_CAP_PIN;
6949 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6950 mask |= CEPH_CAP_AUTH_SHARED;
6951 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6952 mask |= CEPH_CAP_LINK_SHARED;
6953 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
6954 mask |= CEPH_CAP_FILE_SHARED;
6955 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
6956 mask |= CEPH_CAP_XATTR_SHARED;
6957out:
6958 return mask;
6959}
6960
6961int Client::statx(const char *relpath, struct ceph_statx *stx,
6962 const UserPerm& perms,
6963 unsigned int want, unsigned int flags)
6964{
6965 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
6966 Mutex::Locker lock(client_lock);
6967 tout(cct) << "statx" << std::endl;
6968 tout(cct) << relpath << std::endl;
6969 filepath path(relpath);
6970 InodeRef in;
6971
6972 unsigned mask = statx_to_mask(flags, want);
6973
6974 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
6975 if (r < 0)
6976 return r;
6977
6978 r = _getattr(in, mask, perms);
6979 if (r < 0) {
6980 ldout(cct, 3) << "statx exit on error!" << dendl;
6981 return r;
6982 }
6983
6984 fill_statx(in, mask, stx);
6985 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
6986 return r;
6987}
6988
6989int Client::lstat(const char *relpath, struct stat *stbuf,
6990 const UserPerm& perms, frag_info_t *dirstat, int mask)
6991{
6992 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6993 Mutex::Locker lock(client_lock);
6994 tout(cct) << "lstat" << std::endl;
6995 tout(cct) << relpath << std::endl;
6996 filepath path(relpath);
6997 InodeRef in;
6998 // don't follow symlinks
6999 int r = path_walk(path, &in, perms, false, mask);
7000 if (r < 0)
7001 return r;
7002 r = _getattr(in, mask, perms);
7003 if (r < 0) {
7004 ldout(cct, 3) << "lstat exit on error!" << dendl;
7005 return r;
7006 }
7007 fill_stat(in, stbuf, dirstat);
7008 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7009 return r;
7010}
7011
7012int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7013{
7014 ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
7015 << " mode 0" << oct << in->mode << dec
7016 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7017 memset(st, 0, sizeof(struct stat));
7018 if (use_faked_inos())
7019 st->st_ino = in->faked_ino;
7020 else
7021 st->st_ino = in->ino;
7022 st->st_dev = in->snapid;
7023 st->st_mode = in->mode;
7024 st->st_rdev = in->rdev;
7025 st->st_nlink = in->nlink;
7026 st->st_uid = in->uid;
7027 st->st_gid = in->gid;
7028 if (in->ctime > in->mtime) {
7029 stat_set_ctime_sec(st, in->ctime.sec());
7030 stat_set_ctime_nsec(st, in->ctime.nsec());
7031 } else {
7032 stat_set_ctime_sec(st, in->mtime.sec());
7033 stat_set_ctime_nsec(st, in->mtime.nsec());
7034 }
7035 stat_set_atime_sec(st, in->atime.sec());
7036 stat_set_atime_nsec(st, in->atime.nsec());
7037 stat_set_mtime_sec(st, in->mtime.sec());
7038 stat_set_mtime_nsec(st, in->mtime.nsec());
7039 if (in->is_dir()) {
7040 if (cct->_conf->client_dirsize_rbytes)
7041 st->st_size = in->rstat.rbytes;
7042 else
7043 st->st_size = in->dirstat.size();
7044 st->st_blocks = 1;
7045 } else {
7046 st->st_size = in->size;
7047 st->st_blocks = (in->size + 511) >> 9;
7048 }
7049 st->st_blksize = MAX(in->layout.stripe_unit, 4096);
7050
7051 if (dirstat)
7052 *dirstat = in->dirstat;
7053 if (rstat)
7054 *rstat = in->rstat;
7055
7056 return in->caps_issued();
7057}
7058
// Populate a ceph_statx from cached inode state.  'mask' is the set of
// caps we hold (see statx_to_mask); each group of fields is filled, and
// its CEPH_STATX_* bits recorded in stx->stx_mask, only when the
// corresponding cap is present.
7059 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7060 {
7061 ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
7062 << " mode 0" << oct << in->mode << dec
7063 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7064 memset(stx, 0, sizeof(struct ceph_statx));
7065
7066 /*
7067 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7068 * so that all bits are set.
7069 */
7070 if (!mask)
7071 mask = ~0;
7072
7073 /* These are always considered to be available */
7074 stx->stx_dev = in->snapid;
7075 stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);
7076
7077 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7078 stx->stx_mode = S_IFMT & in->mode;
7079 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7080 stx->stx_rdev = in->rdev;
7081 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7082
// Ownership, full mode and birth time need As (auth shared).
7083 if (mask & CEPH_CAP_AUTH_SHARED) {
7084 stx->stx_uid = in->uid;
7085 stx->stx_gid = in->gid;
7086 stx->stx_mode = in->mode;
7087 in->btime.to_timespec(&stx->stx_btime);
7088 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7089 }
7090
// Link count needs Ls (link shared).
7091 if (mask & CEPH_CAP_LINK_SHARED) {
7092 stx->stx_nlink = in->nlink;
7093 stx->stx_mask |= CEPH_STATX_NLINK;
7094 }
7095
// Times and sizes need Fs (file shared).
7096 if (mask & CEPH_CAP_FILE_SHARED) {
7097
7098 in->atime.to_timespec(&stx->stx_atime);
7099 in->mtime.to_timespec(&stx->stx_mtime);
7100
7101 if (in->is_dir()) {
7102 if (cct->_conf->client_dirsize_rbytes)
7103 stx->stx_size = in->rstat.rbytes;
7104 else
7105 stx->stx_size = in->dirstat.size();
7106 stx->stx_blocks = 1;
7107 } else {
7108 stx->stx_size = in->size;
7109 stx->stx_blocks = (in->size + 511) >> 9;
7110 }
7111 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7112 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7113 }
7114
7115 /* Change time and change_attr both require all shared caps to view */
7116 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7117 stx->stx_version = in->change_attr;
7118 if (in->ctime > in->mtime)
7119 in->ctime.to_timespec(&stx->stx_ctime);
7120 else
7121 in->mtime.to_timespec(&stx->stx_ctime);
7122 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7123 }
7124
7125 }
7126
7127void Client::touch_dn(Dentry *dn)
7128{
7129 lru.lru_touch(dn);
7130}
7131
7132int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7133{
7134 Mutex::Locker lock(client_lock);
7135 tout(cct) << "chmod" << std::endl;
7136 tout(cct) << relpath << std::endl;
7137 tout(cct) << mode << std::endl;
7138 filepath path(relpath);
7139 InodeRef in;
7140 int r = path_walk(path, &in, perms);
7141 if (r < 0)
7142 return r;
7143 struct stat attr;
7144 attr.st_mode = mode;
7145 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7146}
7147
7148int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7149{
7150 Mutex::Locker lock(client_lock);
7151 tout(cct) << "fchmod" << std::endl;
7152 tout(cct) << fd << std::endl;
7153 tout(cct) << mode << std::endl;
7154 Fh *f = get_filehandle(fd);
7155 if (!f)
7156 return -EBADF;
7157#if defined(__linux__) && defined(O_PATH)
7158 if (f->flags & O_PATH)
7159 return -EBADF;
7160#endif
7161 struct stat attr;
7162 attr.st_mode = mode;
7163 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7164}
7165
7166int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7167{
7168 Mutex::Locker lock(client_lock);
7169 tout(cct) << "lchmod" << std::endl;
7170 tout(cct) << relpath << std::endl;
7171 tout(cct) << mode << std::endl;
7172 filepath path(relpath);
7173 InodeRef in;
7174 // don't follow symlinks
7175 int r = path_walk(path, &in, perms, false);
7176 if (r < 0)
7177 return r;
7178 struct stat attr;
7179 attr.st_mode = mode;
7180 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7181}
7182
7183int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7184 const UserPerm& perms)
7185{
7186 Mutex::Locker lock(client_lock);
7187 tout(cct) << "chown" << std::endl;
7188 tout(cct) << relpath << std::endl;
7189 tout(cct) << new_uid << std::endl;
7190 tout(cct) << new_gid << std::endl;
7191 filepath path(relpath);
7192 InodeRef in;
7193 int r = path_walk(path, &in, perms);
7194 if (r < 0)
7195 return r;
7196 struct stat attr;
7197 attr.st_uid = new_uid;
7198 attr.st_gid = new_gid;
7199 int mask = 0;
7200 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7201 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7202 return _setattr(in, &attr, mask, perms);
7203}
7204
7205int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7206{
7207 Mutex::Locker lock(client_lock);
7208 tout(cct) << "fchown" << std::endl;
7209 tout(cct) << fd << std::endl;
7210 tout(cct) << new_uid << std::endl;
7211 tout(cct) << new_gid << std::endl;
7212 Fh *f = get_filehandle(fd);
7213 if (!f)
7214 return -EBADF;
7215#if defined(__linux__) && defined(O_PATH)
7216 if (f->flags & O_PATH)
7217 return -EBADF;
7218#endif
7219 struct stat attr;
7220 attr.st_uid = new_uid;
7221 attr.st_gid = new_gid;
7222 int mask = 0;
7223 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7224 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7225 return _setattr(f->inode, &attr, mask, perms);
7226}
7227
7228int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7229 const UserPerm& perms)
7230{
7231 Mutex::Locker lock(client_lock);
7232 tout(cct) << "lchown" << std::endl;
7233 tout(cct) << relpath << std::endl;
7234 tout(cct) << new_uid << std::endl;
7235 tout(cct) << new_gid << std::endl;
7236 filepath path(relpath);
7237 InodeRef in;
7238 // don't follow symlinks
7239 int r = path_walk(path, &in, perms, false);
7240 if (r < 0)
7241 return r;
7242 struct stat attr;
7243 attr.st_uid = new_uid;
7244 attr.st_gid = new_gid;
7245 int mask = 0;
7246 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7247 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7248 return _setattr(in, &attr, mask, perms);
7249}
7250
7251int Client::utime(const char *relpath, struct utimbuf *buf,
7252 const UserPerm& perms)
7253{
7254 Mutex::Locker lock(client_lock);
7255 tout(cct) << "utime" << std::endl;
7256 tout(cct) << relpath << std::endl;
7257 tout(cct) << buf->modtime << std::endl;
7258 tout(cct) << buf->actime << std::endl;
7259 filepath path(relpath);
7260 InodeRef in;
7261 int r = path_walk(path, &in, perms);
7262 if (r < 0)
7263 return r;
7264 struct stat attr;
7265 stat_set_mtime_sec(&attr, buf->modtime);
7266 stat_set_mtime_nsec(&attr, 0);
7267 stat_set_atime_sec(&attr, buf->actime);
7268 stat_set_atime_nsec(&attr, 0);
7269 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7270}
7271
7272int Client::lutime(const char *relpath, struct utimbuf *buf,
7273 const UserPerm& perms)
7274{
7275 Mutex::Locker lock(client_lock);
7276 tout(cct) << "lutime" << std::endl;
7277 tout(cct) << relpath << std::endl;
7278 tout(cct) << buf->modtime << std::endl;
7279 tout(cct) << buf->actime << std::endl;
7280 filepath path(relpath);
7281 InodeRef in;
7282 // don't follow symlinks
7283 int r = path_walk(path, &in, perms, false);
7284 if (r < 0)
7285 return r;
7286 struct stat attr;
7287 stat_set_mtime_sec(&attr, buf->modtime);
7288 stat_set_mtime_nsec(&attr, 0);
7289 stat_set_atime_sec(&attr, buf->actime);
7290 stat_set_atime_nsec(&attr, 0);
7291 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7292}
7293
7294int Client::flock(int fd, int operation, uint64_t owner)
7295{
7296 Mutex::Locker lock(client_lock);
7297 tout(cct) << "flock" << std::endl;
7298 tout(cct) << fd << std::endl;
7299 tout(cct) << operation << std::endl;
7300 tout(cct) << owner << std::endl;
7301 Fh *f = get_filehandle(fd);
7302 if (!f)
7303 return -EBADF;
7304
7305 return _flock(f, operation, owner);
7306}
7307
7308int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7309{
7310 Mutex::Locker lock(client_lock);
7311 tout(cct) << "opendir" << std::endl;
7312 tout(cct) << relpath << std::endl;
7313 filepath path(relpath);
7314 InodeRef in;
7315 int r = path_walk(path, &in, perms, true);
7316 if (r < 0)
7317 return r;
7318 if (cct->_conf->client_permissions) {
7319 int r = may_open(in.get(), O_RDONLY, perms);
7320 if (r < 0)
7321 return r;
7322 }
7323 r = _opendir(in.get(), dirpp, perms);
7324 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7325 if (r != -ENOTDIR)
7326 tout(cct) << (unsigned long)*dirpp << std::endl;
7327 return r;
7328}
7329
7330int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7331{
7332 if (!in->is_dir())
7333 return -ENOTDIR;
7334 *dirpp = new dir_result_t(in, perms);
7335 opened_dirs.insert(*dirpp);
7336 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7337 return 0;
7338}
7339
7340
7341int Client::closedir(dir_result_t *dir)
7342{
7343 Mutex::Locker lock(client_lock);
7344 tout(cct) << "closedir" << std::endl;
7345 tout(cct) << (unsigned long)dir << std::endl;
7346
7347 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7348 _closedir(dir);
7349 return 0;
7350}
7351
// Tear down a dir_result_t: drop its inode reference and readdir buffer,
// unregister it from opened_dirs, and free it. Caller holds client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    // log before reset so the inode pointer is still printable
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7363
7364void Client::rewinddir(dir_result_t *dirp)
7365{
7366 Mutex::Locker lock(client_lock);
7367
7368 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7369 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7370 _readdir_drop_dirp_buffer(d);
7371 d->reset();
7372}
7373
7374loff_t Client::telldir(dir_result_t *dirp)
7375{
7376 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7377 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7378 return d->offset;
7379}
7380
// Reposition a directory handle. Invalidates the buffered fragment when
// the target offset falls outside it, and disables readdir-cache filling
// when the seek would make the walk non-sequential.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-order walk: only a backward seek invalidates the buffer
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-order walk: keep the buffer only if the target lies within the
    // currently buffered fragment, at or after our low-offset position
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7411
7412
7413//struct dirent {
7414// ino_t d_ino; /* inode number */
7415// off_t d_off; /* offset to the next dirent */
7416// unsigned short d_reclen; /* length of this record */
7417// unsigned char d_type; /* type of file */
7418// char d_name[256]; /* filename */
7419//};
// Populate a struct dirent from a name/type/ino triple. `next_off` is the
// offset of the *next* entry, matching the d_off convention.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // copy at most 255 bytes and force NUL termination (d_name is 256 bytes)
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);  // S_IF* mode bits -> DT_* dirent type
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7435
// Advance the cursor to the next directory fragment, or mark end-of-dir
// if the buffered fragment was already the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag order: restart the new frag from its first real entry (2),
    // then re-check the frag against the fragtree
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7461
// Re-resolve the current fragment against the inode's dirfragtree; if the
// tree now maps our position to a different frag, restart from that frag.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;  // nothing to re-choose in hash order

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    // restart at entry 2 (first entry after . and ..) of the remapped frag
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7478
// Discard the buffered fragment contents; the next readdir will refetch.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7484
// Fetch one directory fragment from the MDS into dirp->buffer.
// On -EAGAIN (frag remapped — see _readdir_rechoose_frag) we re-choose
// the frag and retry recursively; any other error marks the dir at-end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  // listing a snapdir enumerates snapshots instead of dentries
  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // resume listing after the last entry we already returned
    req->path2.set_path(dirp->last_name.c_str());
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7539
// Comparator for binary-searching the readdir cache by file position.
// Uses fpos_cmp (fragment-aware compare) rather than a raw int64 compare.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7545
// Serve readdir entries from the locally cached Dir/Dentry state instead
// of issuing MDS requests.  Returns 0 at end of dir, -EAGAIN when the
// cache stops being complete+ordered mid-walk (caller falls back to
// fetching frags), a negative error from _getattr/cb, or a positive
// stop value propagated from cb.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first dentry at/after our position
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // re-check every iteration: the callback below runs with client_lock
    // dropped, so the cache may be invalidated while we iterate
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative dentry
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry belongs to an older cap generation; don't trust it
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      // caller wants a referenced inode alongside the dirent
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop client_lock for the callback: it may block or re-enter us
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;  // positive: callback asked us to stop
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7628
// Core readdir driver: invoke `cb` once per entry starting at the current
// cursor.  Synthesizes "." and ".." first, then serves from the local
// cache if it is complete+ordered and we hold FILE_SHARED, otherwise
// fetches fragments from the MDS.  `cb` returning <0 aborts with that
// error, >0 stops the walk (value is returned), 0 continues.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize the "." entry for the directory itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // callback runs without client_lock held — it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." (first parent, or the dir itself if none)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->inode;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache invalidated mid-walk; fall through to frag fetching
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;  // attrs in the fresh reply are current
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver the buffered entries at/after the current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // this frag has more entries than fit in one MDS reply
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // we walked the whole directory; if nothing changed underneath us the
    // local cache is now authoritative — mark it complete (and ordered)
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached: the loop always returns
  return 0;
}
7818
7819
7820int Client::readdir_r(dir_result_t *d, struct dirent *de)
7821{
7822 return readdirplus_r(d, de, 0, 0, 0, NULL);
7823}
7824
7825/*
7826 * readdirplus_r
7827 *
7828 * returns
7829 * 1 if we got a dirent
7830 * 0 for end of directory
7831 * <0 on error
7832 */
7833
// Result slot for the single-entry readdir callback: holds at most one
// directory entry per readdir_r_cb() invocation.
struct single_readdir {
  struct dirent *de;       // where to copy the entry
  struct ceph_statx *stx;  // optional stat output (may be NULL)
  Inode *inode;            // referenced inode, when the walk was asked for one
  bool full;               // set once a dirent has been filled
};
7840
7841static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7842 struct ceph_statx *stx, off_t off,
7843 Inode *in)
7844{
7845 single_readdir *c = static_cast<single_readdir *>(p);
7846
7847 if (c->full)
7848 return -1; // already filled this dirent
7849
7850 *c->de = *de;
7851 if (c->stx)
7852 *c->stx = *stx;
7853 c->inode = in;
7854 c->full = true;
7855 return 1;
7856}
7857
// POSIX-readdir(3)-style wrapper: return a pointer to the next entry, or
// NULL at end of directory / on error (errno set).
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  // NOTE: static buffer, overwritten on every call — same non-thread-safe
  // contract as readdir(3)
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret; // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;  // end of directory
}
7880
7881int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
7882 struct ceph_statx *stx, unsigned want,
7883 unsigned flags, Inode **out)
7884{
7885 single_readdir sr;
7886 sr.de = de;
7887 sr.stx = stx;
7888 sr.inode = NULL;
7889 sr.full = false;
7890
7891 // our callback fills the dirent and sets sr.full=true on first
7892 // call, and returns -1 the second time around.
7893 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
7894 if (r < -1)
7895 return r;
7896 if (out)
7897 *out = sr.inode;
7898 if (sr.full)
7899 return 1;
7900 return 0;
7901}
7902
7903
7904/* getdents */
/* getdents */
// Accumulator for _getdents: packs entries into a caller-supplied buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: just NUL-terminated names
};
7911
7912static int _readdir_getdent_cb(void *p, struct dirent *de,
7913 struct ceph_statx *stx, off_t off, Inode *in)
7914{
7915 struct getdents_result *c = static_cast<getdents_result *>(p);
7916
7917 int dlen;
7918 if (c->fullent)
7919 dlen = sizeof(*de);
7920 else
7921 dlen = strlen(de->d_name) + 1;
7922
7923 if (c->pos + dlen > c->buflen)
7924 return -1; // doesn't fit
7925
7926 if (c->fullent) {
7927 memcpy(c->buf + c->pos, de, sizeof(*de));
7928 } else {
7929 memcpy(c->buf + c->pos, de->d_name, dlen);
7930 }
7931 c->pos += dlen;
7932 return 0;
7933}
7934
7935int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
7936{
7937 getdents_result gr;
7938 gr.buf = buf;
7939 gr.buflen = buflen;
7940 gr.fullent = fullent;
7941 gr.pos = 0;
7942
7943 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
7944
7945 if (r < 0) { // some error
7946 if (r == -1) { // buffer ran out of space
7947 if (gr.pos) { // but we got some entries already!
7948 return gr.pos;
7949 } // or we need a larger buffer
7950 return -ERANGE;
7951 } else { // actual error, return it
7952 return r;
7953 }
7954 }
7955 return gr.pos;
7956}
7957
7958
7959/* getdir */
/* getdir */
// Accumulator for getdir: collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // destination list of names
  int num;                 // entries appended so far
};
7964
7965static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
7966{
7967 getdir_result *r = static_cast<getdir_result *>(p);
7968
7969 r->contents->push_back(de->d_name);
7970 r->num++;
7971 return 0;
7972}
7973
7974int Client::getdir(const char *relpath, list<string>& contents,
7975 const UserPerm& perms)
7976{
7977 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
7978 {
7979 Mutex::Locker lock(client_lock);
7980 tout(cct) << "getdir" << std::endl;
7981 tout(cct) << relpath << std::endl;
7982 }
7983
7984 dir_result_t *d;
7985 int r = opendir(relpath, &d, perms);
7986 if (r < 0)
7987 return r;
7988
7989 getdir_result gr;
7990 gr.contents = &contents;
7991 gr.num = 0;
7992 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
7993
7994 closedir(d);
7995
7996 if (r < 0)
7997 return r;
7998 return gr.num;
7999}
8000
8001
8002/****** file i/o **********/
/****** file i/o **********/
// Open (and optionally create) the file at `relpath`.  Striping
// parameters are honored only when a new file is created (0 = default).
// Returns a new integer file descriptor, or a negative error.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // O_CREAT|O_EXCL requires the path not to exist yet
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // O_NOFOLLOW on a symlink fails (unless it's an O_PATH open on Linux)
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing: create the final component in its parent directory
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8084
8085int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8086{
8087 /* Use default file striping parameters */
8088 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8089}
8090
8091int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8092 const UserPerm& perms)
8093{
8094 Mutex::Locker lock(client_lock);
8095 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8096
8097 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8098 filepath path(ino);
8099 req->set_filepath(path);
8100
8101 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8102 char f[30];
8103 sprintf(f, "%u", h);
8104 filepath path2(dirino);
8105 path2.push_dentry(string(f));
8106 req->set_filepath2(path2);
8107
8108 int r = make_request(req, perms, NULL, NULL,
8109 rand() % mdsmap->get_num_in_mds());
8110 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8111 return r;
8112}
8113
8114
8115/**
8116 * Load inode into local cache.
8117 *
8118 * If inode pointer is non-NULL, and take a reference on
8119 * the resulting Inode object in one operation, so that caller
8120 * can safely assume inode will still be there after return.
8121 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // any active MDS can serve the lookup; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // on success the inode must now be present in inode_map (asserted);
    // hand the caller a referenced pointer
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8142
8143
8144
8145/**
8146 * Find the parent inode of `ino` and insert it into
8147 * our cache. Conditionally also set `parent` to a referenced
8148 * Inode* if caller provides non-NULL value.
8149 */
8150int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8151{
8152 Mutex::Locker lock(client_lock);
8153 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8154
8155 if (!ino->dn_set.empty()) {
8156 // if we exposed the parent here, we'd need to check permissions,
8157 // but right now we just rely on the MDS doing so in make_request
8158 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8159 return 0;
8160 }
8161
8162 if (ino->is_root()) {
8163 *parent = NULL;
8164 ldout(cct, 3) << "ino is root, no parent" << dendl;
8165 return -EINVAL;
8166 }
8167
8168 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8169 filepath path(ino->ino);
8170 req->set_filepath(path);
8171
8172 InodeRef target;
8173 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8174 // Give caller a reference to the parent ino if they provided a pointer.
8175 if (parent != NULL) {
8176 if (r == 0) {
8177 *parent = target.get();
8178 _ll_get(*parent);
8179 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8180 } else {
8181 *parent = NULL;
8182 }
8183 }
8184 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8185 return r;
8186}
8187
8188
8189/**
8190 * Populate the parent dentry for `ino`, provided it is
8191 * a child of `parent`.
8192 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  // ask the MDS to resolve the name of `ino` within `parent`
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any active MDS can service this; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8209
8210
// Build a new file handle for an opened inode, configuring its readahead
// window from client config and the file's striping layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;  // remembered for later permission-bearing ops

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes track open handles via snap_cap_refs instead of caps
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // readahead: bounded by both a byte limit and a striping-period limit
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to object-set and stripe-unit boundaries
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8247
// Release a file handle: drop the open ref (flushing dirty data and
// re-evaluating caps when it was the last one), release file locks, and
// surface any asynchronous flush error to the caller.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref for this mode: flush dirty data and update caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes just track a handle count (see _create_fh)
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8280
8281void Client::_put_fh(Fh *f)
8282{
8283 int left = f->put();
8284 if (!left) {
8285 delete f;
8286 }
8287}
8288
// Open an inode: record an open ref, then either satisfy the open with
// caps we already hold or issue CEPH_MDS_OP_OPEN. On success a new Fh is
// returned via *fhp (if non-NULL).
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 &&
      in->caps_issued_mask(want)) {
    // we already hold the caps this mode needs — no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT was already handled by the caller; don't resend it
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size; // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);  // undo the pending-open note from above
  }

  trim_cache();

  return result;
}
8339
// Re-acquire file caps for an inode whose caps lapsed: if existing caps
// suffice, just re-evaluate them; otherwise replay an OPEN to the MDS
// with flags derived from the currently wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted RD/WR caps back into open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8377
8378int Client::close(int fd)
8379{
8380 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8381 Mutex::Locker lock(client_lock);
8382 tout(cct) << "close" << std::endl;
8383 tout(cct) << fd << std::endl;
8384
8385 Fh *fh = get_filehandle(fd);
8386 if (!fh)
8387 return -EBADF;
8388 int err = _release_fh(fh);
8389 fd_map.erase(fd);
8390 put_fd(fd);
8391 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8392 return err;
8393}
8394
8395
8396// ------------
8397// read, write
8398
8399loff_t Client::lseek(int fd, loff_t offset, int whence)
8400{
8401 Mutex::Locker lock(client_lock);
8402 tout(cct) << "lseek" << std::endl;
8403 tout(cct) << fd << std::endl;
8404 tout(cct) << offset << std::endl;
8405 tout(cct) << whence << std::endl;
8406
8407 Fh *f = get_filehandle(fd);
8408 if (!f)
8409 return -EBADF;
8410#if defined(__linux__) && defined(O_PATH)
8411 if (f->flags & O_PATH)
8412 return -EBADF;
8413#endif
8414 return _lseek(f, offset, whence);
8415}
8416
8417loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8418{
8419 Inode *in = f->inode.get();
8420 int r;
8421
8422 switch (whence) {
8423 case SEEK_SET:
8424 f->pos = offset;
8425 break;
8426
8427 case SEEK_CUR:
8428 f->pos += offset;
8429 break;
8430
8431 case SEEK_END:
8432 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8433 if (r < 0)
8434 return r;
8435 f->pos = in->size + offset;
8436 break;
8437
8438 default:
8439 ceph_abort();
8440 }
8441
8442 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8443 return f->pos;
8444}
8445
8446
// Acquire the per-handle position lock (serializes users of f->pos across
// operations that drop client_lock). Waiters queue FIFO on pos_waiters
// and block on a Cond tied to client_lock.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free AND we are at the head of the queue
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8464
8465void Client::unlock_fh_pos(Fh *f)
8466{
8467 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8468 f->pos_locked = false;
8469}
8470
// Migrate an inode's inline data out to its first RADOS object:
// (1) ensure object 0 exists, then (2) write the inline payload guarded
// by a cmpxattr on "inline_version" so a newer migration isn't clobbered.
// `onfinish` fires when the guarded write completes (immediately with 0
// if there is no inline data).
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object 0 of the file: "<ino in hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // step 1: make sure the object exists (create without exclusive flag)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // step 2: write the payload only if our inline_version is newer than
  // whatever is recorded on the object, then stamp the new version
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8515
8516//
8517
8518// blocking osd interface
8519
8520int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8521{
8522 Mutex::Locker lock(client_lock);
8523 tout(cct) << "read" << std::endl;
8524 tout(cct) << fd << std::endl;
8525 tout(cct) << size << std::endl;
8526 tout(cct) << offset << std::endl;
8527
8528 Fh *f = get_filehandle(fd);
8529 if (!f)
8530 return -EBADF;
8531#if defined(__linux__) && defined(O_PATH)
8532 if (f->flags & O_PATH)
8533 return -EBADF;
8534#endif
8535 bufferlist bl;
8536 int r = _read(f, offset, size, &bl);
8537 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8538 if (r >= 0) {
8539 bl.copy(0, bl.length(), buf);
8540 r = bl.length();
8541 }
8542 return r;
8543}
8544
8545int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8546{
8547 if (iovcnt < 0)
8548 return -EINVAL;
8549 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8550}
8551
// Core read path.  Handles positional reads (offset < 0 => use and advance
// f->pos under the fh position lock), inline data, the buffered
// (ObjectCacher) fast path and the synchronous OSD path, retrying on a
// short read when the locally cached size proved stale.
// Returns bytes appended to *bl, or a negative errno.
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "read at the current file position"
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we have never learned the inline state;
  // force a getattr so the inline branches below are meaningful.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;  // O_DIRECT bypasses the object cache

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // no cache cap: push the inline data out to RADOS first, then fall
      // through to a normal read; completion is awaited in 'done:' below.
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read straight out of the cached inline data,
      // zero-filling the gap between inline length and EOF.
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // buffered path via ObjectCacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: our cached size may be stale; drop caps, re-stat,
      // and retry from where we left off if there is more file.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline mutation outside client_lock
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8695
// Readahead completion context: pins the Fh and records an outstanding
// readahead so the handle cannot go away while the async read is in flight.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8701
// Undo the pin taken in the constructor: drop the pending-readahead count
// and release the Fh reference.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8706
// Called when the readahead I/O completes; releases the cap references
// taken when the readahead was initiated in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8711
// Buffered read through the ObjectCacher.  Trims the request to EOF,
// performs the (possibly blocking) cached read, then kicks off an
// opportunistic readahead whose completion is tracked by C_Readahead.
// Returns bytes read or a negative errno.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // miss: the cacher took ownership of onfinish; drop client_lock and
    // wait for the data to arrive.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight; caps released by C_Readahead::finish
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8776
// Synchronous (uncached) read: issue Filer reads object-by-object until
// 'len' bytes are gathered.  ENOENT from the OSD is treated as a hole
// (zero bytes).  On a short read inside the known file size we zero-fill
// up to EOF; beyond that we set *checkeof so the caller can re-stat and
// retry.  Returns bytes placed in *bl or a negative errno.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // block for the OSD reply outside client_lock
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit (apparent) EOF: let the caller reverify the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
8843
8844
8845/*
8846 * we keep count of uncommitted sync writes on the inode, so that
8847 * fsync can DDRT.
8848 */
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can DDRT.
 */
// Called when a synchronous write has been committed by the OSDs:
// decrement the outstanding-unsafe-write count, drop the buffer cap ref
// taken when the write was issued, and wake the unmount path if it is
// waiting for the last unsafe write to drain.
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
8862
8863int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
8864{
8865 Mutex::Locker lock(client_lock);
8866 tout(cct) << "write" << std::endl;
8867 tout(cct) << fd << std::endl;
8868 tout(cct) << size << std::endl;
8869 tout(cct) << offset << std::endl;
8870
8871 Fh *fh = get_filehandle(fd);
8872 if (!fh)
8873 return -EBADF;
8874#if defined(__linux__) && defined(O_PATH)
8875 if (fh->flags & O_PATH)
8876 return -EBADF;
8877#endif
8878 int r = _write(fh, offset, size, buf, NULL, 0);
8879 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
8880 return r;
8881}
8882
8883int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
8884{
8885 if (iovcnt < 0)
8886 return -EINVAL;
8887 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
8888}
8889
// Shared implementation for preadv/pwritev.  For writes, the iovec is
// passed straight through to _write (which flattens it); for reads we do
// one _read of the total length and then scatter the bufferlist back into
// the caller's iovec, truncating at whatever _read actually returned.
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " <<  offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data 
       * to fill in the iov 
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
8936
// Core write path.  Validates size/quota/pool-full conditions, resolves
// O_APPEND/positional offsets under the fh position lock, acquires WR
// caps, clears setuid/setgid bits, and then takes one of three paths:
// inline-data update, buffered write via the ObjectCacher, or a blocking
// synchronous Filer write.  Exactly one of 'buf' (flat buffer) or 'iov'
// (scatter list) supplies the data.  Returns bytes written or -errno.
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // learn the inline state if we never have (see _read for the same dance)
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely((in->mode & S_ISUID) ||
	       (in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;  // O_DIRECT bypasses the object cache

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // write no longer fits inline (or we lack the cap): push the inline
      // data out to RADOS and do a normal write below.
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // splice the new bytes into the cached inline data:
      // keep the tail beyond endoff, drop/zero-pad the overwritten span.
      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // block for the OSD ack outside client_lock
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    mark_caps_dirty(in, CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  mark_caps_dirty(in, CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline mutation outside client_lock
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9175
9176int Client::_flush(Fh *f)
9177{
9178 Inode *in = f->inode.get();
9179 int err = f->take_async_err();
9180 if (err != 0) {
9181 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9182 << cpp_strerror(err) << dendl;
9183 } else {
9184 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9185 }
9186
9187 return err;
9188}
9189
// Truncate-by-path: delegate to setattrx with only the size field set.
// Only stx_size is initialized; the CEPH_SETATTR_SIZE mask restricts
// which fields setattrx reads.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms) 
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9196
9197int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9198{
9199 Mutex::Locker lock(client_lock);
9200 tout(cct) << "ftruncate" << std::endl;
9201 tout(cct) << fd << std::endl;
9202 tout(cct) << length << std::endl;
9203
9204 Fh *f = get_filehandle(fd);
9205 if (!f)
9206 return -EBADF;
9207#if defined(__linux__) && defined(O_PATH)
9208 if (f->flags & O_PATH)
9209 return -EBADF;
9210#endif
9211 struct stat attr;
9212 attr.st_size = length;
9213 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9214}
9215
// fsync entry point: flush data (and metadata unless syncdataonly) for an
// open fd, then fold in any latched asynchronous write error so the caller
// sees background failures exactly once.
int Client::fsync(int fd, bool syncdataonly) 
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9248
// Inode-level fsync: flush dirty cached data via the ObjectCacher (when
// enabled), flush dirty caps and wait for the MDS to commit them, and wait
// for any unsafe (unacked) metadata requests on the inode.  When the object
// cacher is disabled we instead poll the FILE_BUFFER cap refcount until all
// sync writes have committed.  Returns 0 or the flush error.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }
  
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // wait for the newest unsafe request; safety is delivered in order
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }
   
  return r;
}
9313
// Fh-level fsync: thin wrapper that logs and forwards to the inode-level
// _fsync above.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9319
9320int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9321{
9322 Mutex::Locker lock(client_lock);
9323 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9324 tout(cct) << fd << std::endl;
9325
9326 Fh *f = get_filehandle(fd);
9327 if (!f)
9328 return -EBADF;
9329 int r = _getattr(f->inode, mask, perms);
9330 if (r < 0)
9331 return r;
9332 fill_stat(f->inode, stbuf, NULL);
9333 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9334 return r;
9335}
9336
// statx-style fstat: translate the caller's want/flags into a cap mask,
// only round-trip to the MDS when the needed caps are not already issued,
// then fill the ceph_statx from the (possibly cached) inode.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // skip the getattr if our issued caps already cover the request
  if (mask && !f->inode->caps_issued_mask(mask)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9363
9364// not written yet, but i want to link!
9365
// Change the client's working directory to 'relpath' and report the new
// absolute cwd through 'new_cwd'.  Note: path_walk does not verify the
// target is a directory here; cwd simply becomes the resolved inode.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;
  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  getcwd(new_cwd, perms);
  return 0;
}
9384
// Reconstruct the absolute path of the current working directory by
// walking dentries from cwd up to the root.  If a parent link is missing
// from the cache, a LOOKUPNAME request recovers it and the walk restarts.
// On an unlinked cwd/ancestor, 'dir' is left untouched.
void Client::getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9424
// statvfs for the mounted filesystem.  Pulls cluster-wide (or single
// data-pool) usage from the OSDs, then reports in 4MB blocks.  If the
// mount root (or a visible ancestor) carries a byte quota and
// client_quota_df is enabled, the quota is reported as the filesystem
// size instead of the raw cluster statistics.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  ceph_statfs stats;
  C_SaferCond cond;

  // with a single data pool we can report that pool's usage precisely
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // block for the OSD statistics outside client_lock
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9520
// Perform a file-lock operation (get or set, fcntl or flock flavor) by
// sending a filelock request to the MDS and, on success, mirroring the
// result into the client-side lock state.
//
// @param in        inode the lock applies to
// @param fh        file handle issuing the operation (holds actor_perms and
//                  per-handle lock bookkeeping)
// @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
// @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
// @param sleep     non-zero to block waiting on a conflicting lock
// @param fl        POSIX description of the requested lock; for GETFILELOCK
//                  it is overwritten with the conflicting lock (or F_UNLCK)
// @param owner     lock-owner id; the top bit is forced on (see below)
// @param removing  true when called from _release_filelocks: update the
//                  inode-wide state but skip the per-fh bookkeeping
// @return 0 on success, negative errno on failure
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate the POSIX lock type into the MDS lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a SETFILELOCK that actually acquires a lock can block; GET and
  // unlock never wait.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hand a ref on the request to the interrupt callback
    // so a signal can abort the blocking wait (see _interrupt_filelock)
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    // drop the extra ref taken for the interrupt callback above
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the lock the MDS reported back into the caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the successful change into the inode-wide lock state,
      // allocating the state object lazily on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
        lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
        lock_state = in->flock_locks;
      } else {
        ceph_abort();
        return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
        // Also record the lock on the file handle so it can be dropped when
        // the handle is closed (see _release_filelocks).
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
          lock_state = fh->fcntl_locks;
        } else {
          if (!fh->flock_locks)
            fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
          lock_state = fh->flock_locks;
        }
        _update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9631
// Interrupt a blocked filelock request (e.g. after signal delivery).
// Marks the original request aborted with -EINTR and, if that request has
// already reached an MDS, issues a companion *_INTR unlock request so the
// MDS abandons the wait.  Returns 0 or a negative errno.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Map the original lock flavor to its interrupt counterpart.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // The interrupt request mirrors the original one but asks for UNLOCK
  // under the *_INTR rule.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Reuse the credentials of the request being interrupted.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9664
9665void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9666{
9667 if (!in->fcntl_locks && !in->flock_locks)
9668 return;
9669
9670 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9671 ::encode(nr_fcntl_locks, bl);
9672 if (nr_fcntl_locks) {
9673 ceph_lock_state_t* lock_state = in->fcntl_locks;
9674 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9675 p != lock_state->held_locks.end();
9676 ++p)
9677 ::encode(p->second, bl);
9678 }
9679
9680 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9681 ::encode(nr_flock_locks, bl);
9682 if (nr_flock_locks) {
9683 ceph_lock_state_t* lock_state = in->flock_locks;
9684 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9685 p != lock_state->held_locks.end();
9686 ++p)
9687 ::encode(p->second, bl);
9688 }
9689
9690 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9691 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9692}
9693
// Release every lock recorded on a file handle (called when the handle is
// closed): collect the held locks, free the per-fh bookkeeping, then send
// an unlock to the MDS for each collected lock.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;

  // (lock flavor, lock) pairs to unlock after the per-fh state is freed.
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    ceph_lock_state_t* lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
        p != lock_state->held_locks.end();
        ++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    // NOTE(review): the pointer is not reset to NULL after delete;
    // presumably the fh is being torn down and never reused — confirm at
    // call sites before relying on this.
    delete fh->fcntl_locks;
  }
  if (fh->flock_locks) {
    ceph_lock_state_t* lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
        p != lock_state->held_locks.end();
        ++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    delete fh->flock_locks;
  }

  if (to_release.empty())
    return;

  // Build one F_UNLCK template and reuse it for every held lock.
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    // removing=true: skip per-fh bookkeeping, which was freed above.
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
                 p->second.owner, true);
  }
}
9739
9740void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9741 ceph_lock_state_t *lock_state)
9742{
9743 int lock_cmd;
9744 if (F_RDLCK == fl->l_type)
9745 lock_cmd = CEPH_LOCK_SHARED;
9746 else if (F_WRLCK == fl->l_type)
9747 lock_cmd = CEPH_LOCK_EXCL;
9748 else
9749 lock_cmd = CEPH_LOCK_UNLOCK;;
9750
9751 ceph_filelock filelock;
9752 filelock.start = fl->l_start;
9753 filelock.length = fl->l_len;
9754 filelock.client = 0;
9755 // see comment in _do_filelock()
9756 filelock.owner = owner | (1ULL << 63);
9757 filelock.pid = fl->l_pid;
9758 filelock.type = lock_cmd;
9759
9760 if (filelock.type == CEPH_LOCK_UNLOCK) {
9761 list<ceph_filelock> activated_locks;
9762 lock_state->remove_lock(filelock, activated_locks);
9763 } else {
9764 bool r = lock_state->add_lock(filelock, false, false, NULL);
9765 assert(r);
9766 }
9767}
9768
9769int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9770{
9771 Inode *in = fh->inode.get();
9772 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9773 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9774 return ret;
9775}
9776
9777int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9778{
9779 Inode *in = fh->inode.get();
9780 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9781 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9782 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9783 return ret;
9784}
9785
9786int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9787{
9788 Inode *in = fh->inode.get();
9789 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9790
9791 int sleep = !(cmd & LOCK_NB);
9792 cmd &= ~LOCK_NB;
9793
9794 int type;
9795 switch (cmd) {
9796 case LOCK_SH:
9797 type = F_RDLCK;
9798 break;
9799 case LOCK_EX:
9800 type = F_WRLCK;
9801 break;
9802 case LOCK_UN:
9803 type = F_UNLCK;
9804 break;
9805 default:
9806 return -EINVAL;
9807 }
9808
9809 struct flock fl;
9810 memset(&fl, 0, sizeof(fl));
9811 fl.l_type = type;
9812 fl.l_whence = SEEK_SET;
9813
9814 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
9815 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
9816 return ret;
9817}
9818
9819int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
9820{
9821 /* Since the only thing this does is wrap a call to statfs, and
9822 statfs takes a lock, it doesn't seem we have a need to split it
9823 out. */
9824 return statfs(0, stbuf, perms);
9825}
9826
9827void Client::ll_register_callbacks(struct client_callback_args *args)
9828{
9829 if (!args)
9830 return;
9831 Mutex::Locker l(client_lock);
9832 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
9833 << " invalidate_ino_cb " << args->ino_cb
9834 << " invalidate_dentry_cb " << args->dentry_cb
9835 << " getgroups_cb" << args->getgroups_cb
9836 << " switch_interrupt_cb " << args->switch_intr_cb
9837 << " remount_cb " << args->remount_cb
9838 << dendl;
9839 callback_handle = args->handle;
9840 if (args->ino_cb) {
9841 ino_invalidate_cb = args->ino_cb;
9842 async_ino_invalidator.start();
9843 }
9844 if (args->dentry_cb) {
9845 dentry_invalidate_cb = args->dentry_cb;
9846 async_dentry_invalidator.start();
9847 }
9848 if (args->switch_intr_cb) {
9849 switch_interrupt_cb = args->switch_intr_cb;
9850 interrupt_finisher.start();
9851 }
9852 if (args->remount_cb) {
9853 remount_cb = args->remount_cb;
9854 remount_finisher.start();
9855 }
9856 getgroups_cb = args->getgroups_cb;
9857 umask_cb = args->umask_cb;
9858}
9859
// Choose and sanity-check the mechanism for keeping the kernel dentry
// cache consistent: a dentry-invalidation callback when available,
// otherwise a remount callback (exercised once here), otherwise just warn
// (or abort when client_die_on_failed_remount is set).  Returns the remount
// result when remount is used and that option is set; 0 otherwise.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    // Invoke the remount immediately to verify the callback actually works.
    int s = remount_cb(callback_handle);
    if (s) {
      lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency"
		 << dendl;
    }
    if (cct->_conf->client_die_on_failed_remount) {
      require_remount = true;
      r = s;
    }
  } else {
    lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    if (cct->_conf->client_die_on_failed_remount)
      ceph_abort();
  }
  return r;
}
9887
// Flush all dirty file data and caps and wait for both to be durable:
// kick an objectcacher flush (if the object cache is enabled), flush caps
// to the MDS, wait for outstanding unsafe MDS requests and the cap flush,
// then wait for the data-flush completion.  Called with client_lock held;
// temporarily drops it while waiting on the objectcacher callback.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cache: nothing buffered client-side

  // flush caps
  flush_caps_sync();
  // Capture the tid now so we only wait for flushes issued up to this point.
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  // Wait for the data flush outside client_lock so the completion callback
  // (which may need client-side locks) can make progress.
  if (!flush_done) {
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
9922
9923int Client::sync_fs()
9924{
9925 Mutex::Locker l(client_lock);
9926 return _sync_fs();
9927}
9928
9929int64_t Client::drop_caches()
9930{
9931 Mutex::Locker l(client_lock);
9932 return objectcacher->release_all();
9933}
9934
9935
// Lazy-IO data propagation for an open fd.  NOTE: offset/count are
// currently ignored ("for now") — the whole file is flushed via _fsync.
// Returns 0, or -EBADF for an unknown fd.
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now
  _fsync(f, true);

  return 0;
}
9951
// Lazy-IO synchronize for an open fd: flush dirty data, then drop cached
// data/caps so subsequent reads see other clients' writes.  NOTE:
// offset/count are currently ignored — the whole file is synchronized.
// Returns 0, or -EBADF for an unknown fd.
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  // If _release dropped cached data, re-evaluate what caps we should hold.
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
9968
9969
9970// =============================
9971// snaps
9972
9973int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
9974{
9975 Mutex::Locker l(client_lock);
9976 filepath path(relpath);
9977 InodeRef in;
9978 int r = path_walk(path, &in, perm);
9979 if (r < 0)
9980 return r;
9981 if (cct->_conf->client_permissions) {
9982 r = may_create(in.get(), perm);
9983 if (r < 0)
9984 return r;
9985 }
9986 Inode *snapdir = open_snapdir(in.get());
9987 return _mkdir(snapdir, name, 0, perm);
9988}
9989int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
9990{
9991 Mutex::Locker l(client_lock);
9992 filepath path(relpath);
9993 InodeRef in;
9994 int r = path_walk(path, &in, perms);
9995 if (r < 0)
9996 return r;
9997 if (cct->_conf->client_permissions) {
9998 r = may_delete(in.get(), NULL, perms);
9999 if (r < 0)
10000 return r;
10001 }
10002 Inode *snapdir = open_snapdir(in.get());
10003 return _rmdir(snapdir, name, perms);
10004}
10005
10006// =============================
10007// expose caps
10008
10009int Client::get_caps_issued(int fd) {
10010
10011 Mutex::Locker lock(client_lock);
10012
10013 Fh *f = get_filehandle(fd);
10014 if (!f)
10015 return -EBADF;
10016
10017 return f->inode->caps_issued();
10018}
10019
10020int Client::get_caps_issued(const char *path, const UserPerm& perms)
10021{
10022 Mutex::Locker lock(client_lock);
10023 filepath p(path);
10024 InodeRef in;
10025 int r = path_walk(p, &in, perms, true);
10026 if (r < 0)
10027 return r;
10028 return in->caps_issued();
10029}
10030
10031// =========================================
10032// low level
10033
// Return the virtual ".snap" directory inode for directory 'diri', creating
// and caching it on first access.  The snapdir shares the directory's ino
// but carries the special CEPH_SNAPDIR snapid.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Mirror the parent directory's attributes onto the snapdir.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10065
// Low-level (FUSE) lookup of 'name' under 'parent'.  On success fills
// *attr, takes an ll reference on the found inode, and stores it in *out.
// On failure *out receives NULL (the InodeRef stays empty) and attr->st_ino
// is zeroed.  Returns 0 or a negative errno.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  int r = 0;
  // Permission checks are skipped when FUSE does them itself.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // caller owns an ll reference until ll_forget/ll_put

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10102
// statx-flavored low-level lookup: like ll_lookup but fills a ceph_statx
// restricted to the caps implied by 'want'/'flags'.  On success takes an
// ll reference and stores the inode in *out; on failure zeroes stx_ino and
// stx_mask (and *out receives NULL).  Returns 0 or a negative errno.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  int r = 0;
  // Permission checks are skipped when FUSE does them itself.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller owns an ll reference until ll_forget/ll_put
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10140
10141int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10142 unsigned int want, unsigned int flags, const UserPerm& perms)
10143{
10144 Mutex::Locker lock(client_lock);
10145 filepath fp(name, 0);
10146 InodeRef in;
10147 int rc;
10148 unsigned mask = statx_to_mask(flags, want);
10149
10150 ldout(cct, 3) << "ll_walk" << name << dendl;
10151 tout(cct) << "ll_walk" << std::endl;
10152 tout(cct) << name << std::endl;
10153
10154 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10155 if (rc < 0) {
10156 /* zero out mask, just in case... */
10157 stx->stx_mask = 0;
10158 stx->stx_ino = 0;
10159 *out = NULL;
10160 return rc;
10161 } else {
10162 assert(in);
10163 fill_statx(in, mask, stx);
10164 _ll_get(in.get());
10165 *out = in.get();
10166 return 0;
10167 }
10168}
10169
// Take a low-level (FUSE) reference on an inode.  The first ll reference
// also pins the inode itself and, for a directory, its (single) parent
// dentry so the path stays resolvable.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10182
// Drop 'num' low-level references from an inode.  When the count reaches
// zero the pins taken in _ll_get are released (parent dentry for dirs, the
// inode ref via put_inode).  Returns the remaining ll_ref, 0 if none left.
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10198
// Drop every outstanding low-level reference on every inode (used at
// unmount/shutdown).  The iterator is advanced before _ll_put because
// releasing the last ref can erase the current inode_map entry.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;  // save successor first: _ll_put may invalidate 'it'
    if (in->ll_ref)
      _ll_put(in, in->ll_ref);
  }
}
10213
// FUSE "forget" entry point: drop 'count' low-level references from an
// inode.  Returns true when the inode's last ll reference was dropped
// (also true for the ignored root case).  Forgetting more refs than held
// is tolerated with a warning, dropping whatever is left.
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10239
10240bool Client::ll_put(Inode *in)
10241{
10242 /* ll_forget already takes the lock */
10243 return ll_forget(in, 1);
10244}
10245
10246snapid_t Client::ll_get_snapid(Inode *in)
10247{
10248 Mutex::Locker lock(client_lock);
10249 return in->snapid;
10250}
10251
10252Inode *Client::ll_get_inode(ino_t ino)
10253{
10254 Mutex::Locker lock(client_lock);
10255 vinodeno_t vino = _map_faked_ino(ino);
10256 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10257 if (p == inode_map.end())
10258 return NULL;
10259 Inode *in = p->second;
10260 _ll_get(in);
10261 return in;
10262}
10263
10264Inode *Client::ll_get_inode(vinodeno_t vino)
10265{
10266 Mutex::Locker lock(client_lock);
10267 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10268 if (p == inode_map.end())
10269 return NULL;
10270 Inode *in = p->second;
10271 _ll_get(in);
10272 return in;
10273}
10274
10275int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10276{
10277 vinodeno_t vino = _get_vino(in);
10278
10279 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10280 tout(cct) << "ll_getattr" << std::endl;
10281 tout(cct) << vino.ino.val << std::endl;
10282
10283 if (vino.snapid < CEPH_NOSNAP)
10284 return 0;
10285 else
10286 return _getattr(in, caps, perms);
10287}
10288
10289int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10290{
10291 Mutex::Locker lock(client_lock);
10292
10293 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10294
10295 if (res == 0)
10296 fill_stat(in, attr);
10297 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10298 return res;
10299}
10300
10301int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10302 unsigned int flags, const UserPerm& perms)
10303{
10304 Mutex::Locker lock(client_lock);
10305
10306 int res = 0;
10307 unsigned mask = statx_to_mask(flags, want);
10308
10309 if (mask && !in->caps_issued_mask(mask))
10310 res = _ll_getattr(in, mask, perms);
10311
10312 if (res == 0)
10313 fill_statx(in, mask, stx);
10314 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10315 return res;
10316}
10317
// Shared permission check + attribute update for the ll_setattr* entry
// points.  Logs/traces the requested change, enforces may_setattr unless
// FUSE handles permissions, then applies the change via __setattrx.
// Returns 0 or a negative errno; on success *inp refers to the updated
// inode.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // The *_NOW convenience bits are dropped before the actual setattr;
  // presumably they are consumed by the permission check above and the
  // current time is filled in elsewhere — TODO confirm in __setattrx.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10346
10347int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10348 const UserPerm& perms)
10349{
10350 Mutex::Locker lock(client_lock);
10351 InodeRef target(in);
10352 int res = _ll_setattrx(in, stx, mask, perms, &target);
10353 if (res == 0) {
10354 assert(in == target.get());
10355 fill_statx(in, in->caps_issued(), stx);
10356 }
10357
10358 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10359 return res;
10360}
10361
10362int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10363 const UserPerm& perms)
10364{
10365 struct ceph_statx stx;
10366 stat_to_statx(attr, &stx);
10367
10368 Mutex::Locker lock(client_lock);
10369 InodeRef target(in);
10370 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10371 if (res == 0) {
10372 assert(in == target.get());
10373 fill_stat(in, attr);
10374 }
10375
10376 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10377 return res;
10378}
10379
10380
10381// ----------
10382// xattrs
10383
10384int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10385 const UserPerm& perms)
10386{
10387 Mutex::Locker lock(client_lock);
10388 InodeRef in;
10389 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10390 if (r < 0)
10391 return r;
10392 return _getxattr(in, name, value, size, perms);
10393}
10394
10395int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10396 const UserPerm& perms)
10397{
10398 Mutex::Locker lock(client_lock);
10399 InodeRef in;
10400 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10401 if (r < 0)
10402 return r;
10403 return _getxattr(in, name, value, size, perms);
10404}
10405
10406int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10407 const UserPerm& perms)
10408{
10409 Mutex::Locker lock(client_lock);
10410 Fh *f = get_filehandle(fd);
10411 if (!f)
10412 return -EBADF;
10413 return _getxattr(f->inode, name, value, size, perms);
10414}
10415
10416int Client::listxattr(const char *path, char *list, size_t size,
10417 const UserPerm& perms)
10418{
10419 Mutex::Locker lock(client_lock);
10420 InodeRef in;
10421 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10422 if (r < 0)
10423 return r;
10424 return Client::_listxattr(in.get(), list, size, perms);
10425}
10426
10427int Client::llistxattr(const char *path, char *list, size_t size,
10428 const UserPerm& perms)
10429{
10430 Mutex::Locker lock(client_lock);
10431 InodeRef in;
10432 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10433 if (r < 0)
10434 return r;
10435 return Client::_listxattr(in.get(), list, size, perms);
10436}
10437
10438int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10439{
10440 Mutex::Locker lock(client_lock);
10441 Fh *f = get_filehandle(fd);
10442 if (!f)
10443 return -EBADF;
10444 return Client::_listxattr(f->inode.get(), list, size, perms);
10445}
10446
10447int Client::removexattr(const char *path, const char *name,
10448 const UserPerm& perms)
10449{
10450 Mutex::Locker lock(client_lock);
10451 InodeRef in;
10452 int r = Client::path_walk(path, &in, perms, true);
10453 if (r < 0)
10454 return r;
10455 return _removexattr(in, name, perms);
10456}
10457
10458int Client::lremovexattr(const char *path, const char *name,
10459 const UserPerm& perms)
10460{
10461 Mutex::Locker lock(client_lock);
10462 InodeRef in;
10463 int r = Client::path_walk(path, &in, perms, false);
10464 if (r < 0)
10465 return r;
10466 return _removexattr(in, name, perms);
10467}
10468
10469int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10470{
10471 Mutex::Locker lock(client_lock);
10472 Fh *f = get_filehandle(fd);
10473 if (!f)
10474 return -EBADF;
10475 return _removexattr(f->inode, name, perms);
10476}
10477
10478int Client::setxattr(const char *path, const char *name, const void *value,
10479 size_t size, int flags, const UserPerm& perms)
10480{
10481 _setxattr_maybe_wait_for_osdmap(name, value, size);
10482
10483 Mutex::Locker lock(client_lock);
10484 InodeRef in;
10485 int r = Client::path_walk(path, &in, perms, true);
10486 if (r < 0)
10487 return r;
10488 return _setxattr(in, name, value, size, flags, perms);
10489}
10490
10491int Client::lsetxattr(const char *path, const char *name, const void *value,
10492 size_t size, int flags, const UserPerm& perms)
10493{
10494 _setxattr_maybe_wait_for_osdmap(name, value, size);
10495
10496 Mutex::Locker lock(client_lock);
10497 InodeRef in;
10498 int r = Client::path_walk(path, &in, perms, false);
10499 if (r < 0)
10500 return r;
10501 return _setxattr(in, name, value, size, flags, perms);
10502}
10503
10504int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10505 int flags, const UserPerm& perms)
10506{
10507 _setxattr_maybe_wait_for_osdmap(name, value, size);
10508
10509 Mutex::Locker lock(client_lock);
10510 Fh *f = get_filehandle(fd);
10511 if (!f)
10512 return -EBADF;
10513 return _setxattr(f->inode, name, value, size, flags, perms);
10514}
10515
10516int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10517 const UserPerm& perms)
10518{
10519 int r;
10520
10521 const VXattr *vxattr = _match_vxattr(in, name);
10522 if (vxattr) {
10523 r = -ENODATA;
10524
10525 // Do a force getattr to get the latest quota before returning
10526 // a value to userspace.
10527 r = _getattr(in, 0, perms, true);
10528 if (r != 0) {
10529 // Error from getattr!
10530 return r;
10531 }
10532
10533 // call pointer-to-member function
10534 char buf[256];
10535 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10536 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10537 } else {
10538 r = -ENODATA;
10539 }
10540
10541 if (size != 0) {
10542 if (r > (int)size) {
10543 r = -ERANGE;
10544 } else if (r > 0) {
10545 memcpy(value, buf, r);
10546 }
10547 }
10548 goto out;
10549 }
10550
10551 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10552 r = -EOPNOTSUPP;
10553 goto out;
10554 }
10555
10556 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10557 if (r == 0) {
10558 string n(name);
10559 r = -ENODATA;
10560 if (in->xattrs.count(n)) {
10561 r = in->xattrs[n].length();
10562 if (r > 0 && size != 0) {
10563 if (size >= (unsigned)r)
10564 memcpy(value, in->xattrs[n].c_str(), r);
10565 else
10566 r = -ERANGE;
10567 }
10568 }
10569 }
10570 out:
10571 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10572 return r;
10573}
10574
10575int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10576 const UserPerm& perms)
10577{
10578 if (cct->_conf->client_permissions) {
10579 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10580 if (r < 0)
10581 return r;
10582 }
10583 return _getxattr(in.get(), name, value, size, perms);
10584}
10585
10586int Client::ll_getxattr(Inode *in, const char *name, void *value,
10587 size_t size, const UserPerm& perms)
10588{
10589 Mutex::Locker lock(client_lock);
10590
10591 vinodeno_t vino = _get_vino(in);
10592
10593 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10594 tout(cct) << "ll_getxattr" << std::endl;
10595 tout(cct) << vino.ino.val << std::endl;
10596 tout(cct) << name << std::endl;
10597
10598 if (!cct->_conf->fuse_default_permissions) {
10599 int r = xattr_permission(in, name, MAY_READ, perms);
10600 if (r < 0)
10601 return r;
10602 }
10603
10604 return _getxattr(in, name, value, size, perms);
10605}
10606
// List the names of all xattrs on an inode, including applicable virtual
// ("ceph.*") xattrs, as consecutive NUL-terminated strings in 'name'.
// size == 0 is a probe: only the total byte count is returned.  Returns
// the byte count, -ERANGE when the buffer is too small, or a negative
// errno from the metadata fetch.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Refresh xattrs from the MDS only when we have never seen them.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: accumulate the total space needed (name + NUL each).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // NOTE(review): _vxattrs_name_size presumably counts only the vxattrs
    // that will actually be emitted below (non-hidden, exists_cb true) —
    // confirm, otherwise the computed size could over-report.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    // Second pass: copy names out when a buffer was provided.
    if (size != 0) {
      if (size >= (unsigned)r) {
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10651
// Low-level entry point for listxattr: takes client_lock, traces the
// call, and delegates to _listxattr().  Note that no permission check is
// performed here (unlike ll_getxattr/ll_setxattr).
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
  tout(cct) << "ll_listxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
10666
// Issue a CEPH_MDS_OP_SETXATTR request for *in.  A NULL value is mapped
// to CEPH_XATTR_REMOVE; XATTR_CREATE/XATTR_REPLACE become the matching
// wire flags.  Returns the MDS reply code (0 or negative errno).
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // NOTE(review): when value == NULL callers are expected to pass
  // size == 0 (the ACL paths in _setxattr do); append() would otherwise
  // read through a NULL pointer — confirm for external callers.
  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  // make_request consumes the request reference and blocks for the reply.
  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
10698
// Validate and perform a setxattr on *in.  Rejects snapshots (-EROFS) and
// any namespace other than user./security./trusted./ceph. (plus system.*
// when POSIX ACLs are enabled).  ACL xattrs get special handling: the
// access ACL may collapse into a plain chmod, and the default ACL is only
// valid on directories.  Read-only virtual "ceph.*" xattrs are rejected.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only meaningful when ACL support is compiled in
  // and enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the ACL is equivalent to a plain mode, drop the xattr and
	// apply the mode change instead (ret == 0 means "equivalent").
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs apply to directories only.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Virtual "ceph.*" xattrs that are flagged read-only cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
10759
// InodeRef convenience overload: applies the client-side MAY_WRITE xattr
// permission check (when client_permissions is enabled) before forwarding
// to the Inode* overload.
int Client::_setxattr(InodeRef &in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in.get(), name, value, size, flags, perms);
}
10770
// Check whether a layout-xattr value names a data pool that exists in the
// given osdmap.  'name' is the xattr suffix ("layout" or "layout.pool");
// for "layout" the value is parsed as key=value pairs and the "pool" key
// extracted.  Returns 0 if no pool was named or it exists, -EINVAL on
// parse errors, -ENOENT if the pool is unknown.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // Require the whole value to have been consumed by the parser.
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically or by name; try the numeric form
    // first (lexical_cast<unsigned> throws on non-numeric input), then
    // fall back to a name lookup.
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
10810
// If a layout xattr names a data pool we don't know about, block until we
// have the latest osdmap before sending the request.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // Strip the "ceph.file."/"ceph.dir." prefix; the checker only cares
    // about the "layout"/"layout.pool" suffix.
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Unknown pool: it may have just been created, so wait for a fresh
    // osdmap and let the MDS make the final call.
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
10831
// Low-level entry point for setxattr.  The osdmap wait happens *before*
// taking client_lock because it may block on the Objecter.
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_setxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // Skip the check when FUSE enforces permissions itself.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
10853
// Remove extended attribute 'name' from *in via CEPH_MDS_OP_RMXATTR.
// Snapshots are read-only; only the kernel-client-compatible namespaces
// are accepted, and read-only virtual xattrs cannot be removed.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  // The xattr name travels in the second filepath slot of the request.
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
10885
// InodeRef convenience overload: applies the client-side MAY_WRITE xattr
// permission check (when client_permissions is enabled) before forwarding
// to the Inode* overload.
int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _removexattr(in.get(), name, perms);
}
10895
// Low-level entry point for removexattr: lock, trace, optional
// permission check, then delegate to _removexattr().
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // Skip the check when FUSE enforces permissions itself.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
10915
// Virtual-xattr callbacks for quota ("ceph.quota*").  The getters format
// into val/size and return the snprintf result, i.e. the length that
// would be needed even if the output was truncated.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
10935
// A layout vxattr "exists" only when the inode's layout differs from the
// default-constructed (i.e. unset) file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
10940size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
10941{
10942 int r = snprintf(val, size,
10943 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
10944 (unsigned long long)in->layout.stripe_unit,
10945 (unsigned long long)in->layout.stripe_count,
10946 (unsigned long long)in->layout.object_size);
10947 objecter->with_osdmap([&](const OSDMap& o) {
10948 if (o.have_pg_pool(in->layout.pool_id))
10949 r += snprintf(val + r, size - r, "%s",
10950 o.get_pool_name(in->layout.pool_id).c_str());
10951 else
10952 r += snprintf(val + r, size - r, "%" PRIu64,
10953 (uint64_t)in->layout.pool_id);
10954 });
10955 if (in->layout.pool_ns.length())
10956 r += snprintf(val + r, size - r, " pool_namespace=%s",
10957 in->layout.pool_ns.c_str());
10958 return r;
10959}
// Virtual-xattr callbacks for individual layout fields
// ("ceph.{file,dir}.layout.*").  Each formats one field into val/size and
// returns the snprintf result (length needed, even if truncated).
// NOTE(review): "%lld" is paired with unsigned long long arguments here;
// harmless for values below 2^63 but strictly a specifier mismatch.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  // Resolve the pool id to its name while holding the osdmap; fall back
  // to the numeric id when the pool is not in our map.  The lambda always
  // runs, so r is always assigned before the return.
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
		   in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Virtual-xattr callbacks for directory statistics ("ceph.dir.*").
// dirstat covers the immediate children; rstat is the recursive subtree
// accounting.  Each returns the snprintf length.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
11016size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11017{
11018 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11019 (long)in->rstat.rctime.nsec());
11020}
11021
// Helpers for declaring entries of the VXattr tables below.  The NAME
// macros stringify "ceph.<type>.<name>[.<name2>]".
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, listed vxattr backed by _vxattrcb_<type>_<name>; always exists.
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
}
// Writable, hidden layout sub-field vxattr; "exists" only when the inode
// has a non-default layout (_vxattrcb_layout_exists).
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
}
// Writable, hidden quota sub-field vxattr; "exists" only when quota is
// enabled on the inode (_vxattrcb_quota_exists).
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
}
11049
// Virtual-xattr table for directory inodes: layout (aggregate plus
// per-field), directory/recursive statistics, and quota entries.  The
// table is scanned linearly and terminated by an empty-name entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
11082
// Virtual-xattr table for regular-file inodes: only the layout entries
// apply.  Scanned linearly; terminated by an empty-name entry.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
11098
11099const Client::VXattr *Client::_get_vxattrs(Inode *in)
11100{
11101 if (in->is_dir())
11102 return _dir_vxattrs;
11103 else if (in->is_file())
11104 return _file_vxattrs;
11105 return NULL;
11106}
11107
11108const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11109{
11110 if (strncmp(name, "ceph.", 5) == 0) {
11111 const VXattr *vxattr = _get_vxattrs(in);
11112 if (vxattr) {
11113 while (!vxattr->name.empty()) {
11114 if (vxattr->name == name)
11115 return vxattr;
11116 vxattr++;
11117 }
11118 }
11119 }
11120 return NULL;
11121}
11122
11123size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11124{
11125 size_t len = 0;
11126 while (!vxattr->name.empty()) {
11127 if (!vxattr->hidden)
11128 len += vxattr->name.length() + 1;
11129 vxattr++;
11130 }
11131 return len;
11132}
11133
// Low-level readlink: copy *in's symlink target into buf (up to buflen).
// Touches every dentry referring to the inode to keep them warm in the
// LRU before reading the cached target.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11154
// Create a device/special file 'name' in *dir via CEPH_MDS_OP_MKNOD.
// Rejects overlong names, snapshot dirs (-EROFS) and exceeded file quota
// (-EDQUOT).  The effective mode may be adjusted by inherited default
// ACLs (_posix_acl_create), which also supplies initial xattrs.  On
// success *inp holds the new inode.  The goto-fail path releases the
// request that make_request would otherwise have consumed.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // May rewrite 'mode' per the directory's default ACL and emit the
  // ACL xattrs the MDS should attach to the new inode.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11208
// Low-level mknod: optional may_create check, then _mknod.  On success
// fills *attr, takes an ll reference on the new inode, and returns it
// via *out.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr is left untouched, so the trace/log
  // lines below read caller-provided (possibly uninitialized) st_ino;
  // ll_mkdirx zeroes its statx in the analogous case — confirm intent.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11242
// statx variant of ll_mknod: same flow, but fills a ceph_statx restricted
// to the caps derived from want/flags.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  // NOTE(review): stx is not zeroed on failure here (ll_mkdirx does) —
  // the trace below may read caller-provided stx_ino; confirm intent.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11278
// Create and (optionally) open a regular file 'name' in *dir via
// CEPH_MDS_OP_CREATE.  Layout hints (stripe_unit/count, object_size,
// data_pool) are passed through to the MDS; 'created' reports whether the
// file was newly created vs already existing.  When fhp is non-NULL the
// file is also opened and *fhp receives the handle.  Interface and
// failure paths: 'fail' releases an unsent request; 'reply_error' is
// reached after make_request, which has already consumed it.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the requested data pool name to an id; ids above 32 bits
  // cannot be encoded in the open args.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // May rewrite 'mode' per the directory's default ACL and emit initial
  // ACL xattrs for the new inode.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11371
11372
// Create directory 'name' in *dir.  A mkdir inside the special snapdir
// becomes a snapshot creation (CEPH_MDS_OP_MKSNAP); all other snapshot
// inodes are read-only.  Default ACLs may adjust 'mode' and supply
// initial xattrs.  On success *inp holds the new inode.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  // mkdir in the snapdir creates a snapshot rather than a directory.
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // May rewrite 'mode' per the parent's default ACL and emit the ACL
  // xattrs the MDS should attach to the new directory.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11428
// Low-level mkdir: optional may_create check, then _mkdir.  On success
// fills *attr, takes an ll reference, and returns the new inode via *out.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): attr is untouched on failure, so the trace below may
  // read caller-provided (possibly uninitialized) st_ino — see ll_mkdirx
  // which zeroes its statx in this case.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11460
// statx variant of ll_mkdir: fills stx on success, and explicitly zeroes
// stx_ino/stx_mask on failure so the trace and caller never see garbage.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11496
// Create symlink 'name' -> 'target' in *dir via CEPH_MDS_OP_SYMLINK.
// Rejects overlong names, snapshots (-EROFS) and exceeded file quota
// (-EDQUOT).  On success *inp holds the new inode.  The goto-fail path
// releases the request that make_request would otherwise have consumed.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // The link target travels in the request's second string slot.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11542
// Low-level symlink: optional may_create check, then _symlink.  On
// success fills *attr, takes an ll reference, and returns the new inode
// via *out.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): attr is untouched on failure; the trace below may read
  // caller-provided (possibly uninitialized) st_ino — confirm intent.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11575
// statx variant of ll_symlink: same flow, fills a ceph_statx restricted
// to the caps derived from want/flags on success.
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  // NOTE(review): stx is not zeroed on failure (unlike ll_mkdirx); the
  // trace below may read caller-provided stx_ino — confirm intent.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11609
// Unlink 'name' from *dir via CEPH_MDS_OP_UNLINK.  The target inode is
// looked up first so the request can carry it (and drop its link caps).
// Snapshots are read-only.  The goto-fail path releases the request that
// make_request would otherwise have consumed.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the inode being unlinked so its link caps can be dropped
  // with the request.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;
  req->set_other_inode(otherin.get());
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11655
// Low-level unlink: optional may_delete check, then _unlink.
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // Skip the check when FUSE enforces permissions itself.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
11674
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  // Remove directory (or snapshot) `name` under `dir`.  On the snapdir this
  // becomes a RMSNAP operation.  Caller must hold client_lock.
  ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Only the live tree and the magic snapdir may be modified.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Offer to drop our caps on the dentry and target with the request.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (req->get_op() == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_dentry(de);
    req->set_other_inode(in.get());
  } else {
    // RMSNAP replies carry no trace dentry, so invalidate ours up front.
    unlink(de, true, true);
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Early exit before make_request(): drop our request reference.
  put_request(req);
  return res;
}
11722
11723int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
11724{
11725 Mutex::Locker lock(client_lock);
11726
11727 vinodeno_t vino = _get_vino(in);
11728
11729 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
11730 tout(cct) << "ll_rmdir" << std::endl;
11731 tout(cct) << vino.ino.val << std::endl;
11732 tout(cct) << name << std::endl;
11733
11734 if (!cct->_conf->fuse_default_permissions) {
11735 int r = may_delete(in, name, perms);
11736 if (r < 0)
11737 return r;
11738 }
11739
11740 return _rmdir(in, name, perms);
11741}
11742
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  // Rename fromdir/fromname to todir/toname via CEPH_MDS_OP_RENAME, or
  // CEPH_MDS_OP_RENAMESNAP when renaming a snapshot inside the snapdir.
  // Caller must hold client_lock.  Returns 0 or a negative errno.
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Cannot rename across snapshot boundaries.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Snapshot rename is only allowed within a single snapdir.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // Disallow renames that would cross a quota-root boundary, since the
    // MDS cannot atomically transfer quota accounting between trees.
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    // Offer to drop our caps on both dentries and the inodes involved.
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;
    req->set_old_inode(oldin.get());
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; only a real error aborts.
    res = _lookup(todir, toname, 0, &otherin, perm);
    if (res != 0 && res != -ENOENT) {
      goto fail;
    } else if (res == 0) {
      req->set_other_inode(otherin.get());
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Early exit before make_request(): drop our request reference.
  put_request(req);
  return res;
}
11836
11837int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
11838 const char *newname, const UserPerm& perm)
11839{
11840 Mutex::Locker lock(client_lock);
11841
11842 vinodeno_t vparent = _get_vino(parent);
11843 vinodeno_t vnewparent = _get_vino(newparent);
11844
11845 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
11846 << vnewparent << " " << newname << dendl;
11847 tout(cct) << "ll_rename" << std::endl;
11848 tout(cct) << vparent.ino.val << std::endl;
11849 tout(cct) << name << std::endl;
11850 tout(cct) << vnewparent.ino.val << std::endl;
11851 tout(cct) << newname << std::endl;
11852
11853 if (!cct->_conf->fuse_default_permissions) {
11854 int r = may_delete(parent, name, perm);
11855 if (r < 0)
11856 return r;
11857 r = may_delete(newparent, newname, perm);
11858 if (r < 0 && r != -ENOENT)
11859 return r;
11860 }
11861
11862 return _rename(parent, name, newparent, newname, perm);
11863}
11864
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  // Create hard link dir/newname pointing at `in` via CEPH_MDS_OP_LINK.
  // On success *inp (if used by make_request) receives the linked inode.
  // Caller must hold client_lock.
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the target and the destination directory must be in the live tree.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  // filepath2 identifies the existing inode being linked to.
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Early exit before make_request(): drop our request reference.
  put_request(req);
  return res;
}
11908
11909int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
11910 const UserPerm& perm)
11911{
11912 Mutex::Locker lock(client_lock);
11913
11914 vinodeno_t vino = _get_vino(in);
11915 vinodeno_t vnewparent = _get_vino(newparent);
11916
31f18b77 11917 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
11918 newname << dendl;
11919 tout(cct) << "ll_link" << std::endl;
11920 tout(cct) << vino.ino.val << std::endl;
11921 tout(cct) << vnewparent << std::endl;
11922 tout(cct) << newname << std::endl;
11923
11924 int r = 0;
11925 InodeRef target;
11926
11927 if (!cct->_conf->fuse_default_permissions) {
11928 if (S_ISDIR(in->mode))
11929 return -EPERM;
11930
11931 r = may_hardlink(in, perm);
11932 if (r < 0)
11933 return r;
11934
11935 r = may_create(newparent, perm);
11936 if (r < 0)
11937 return r;
11938 }
11939
11940 return _link(in, newparent, newname, perm, &target);
11941}
11942
11943int Client::ll_num_osds(void)
11944{
11945 Mutex::Locker lock(client_lock);
11946 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
11947}
11948
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  // Store the address of OSD `osd` in *addr (host byte order).
  // Returns 0 on success, -1 if the OSD does not exist in the osdmap.
  // NOTE(review): in4_addr() assumes the OSD address is IPv4 — behavior for
  // IPv6-only OSDs should be confirmed.
  Mutex::Locker lock(client_lock);
  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addr(osd);
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
11965uint32_t Client::ll_stripe_unit(Inode *in)
11966{
11967 Mutex::Locker lock(client_lock);
11968 return in->layout.stripe_unit;
11969}
11970
uint64_t Client::ll_snap_seq(Inode *in)
{
  // Snapshot sequence number of the inode's snap realm.
  // NOTE(review): assumes in->snaprealm is non-null; an inode without an
  // attached realm would crash here — confirm callers guarantee this.
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
11976
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  // Copy the inode's file layout into *layout; always succeeds.
  Mutex::Locker lock(client_lock);
  *layout = in->layout;
  return 0;
}
11983
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  // Fh-based convenience overload: delegate to the inode variant.
  return ll_file_layout(fh->inode.get(), layout);
}
11988
11989/* Currently we cannot take advantage of redundancy in reads, since we
11990 would have to go through all possible placement groups (a
11991 potentially quite large number determined by a hash), and use CRUSH
11992 to calculate the appropriate set of OSDs for each placement group,
11993 then index into that. An array with one entry per OSD is much more
11994 tractable and works for demonstration purposes. */
11995
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  // Map file block `blockno` to the primary OSD currently serving it,
  // using the given striping parameters and the current osdmap.
  Mutex::Locker lock(client_lock);
  inodeno_t ino = ll_get_inodeno(in);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  // Resolve object -> PG -> acting set, and report the primary.
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12022
12023/* Return the offset of the block, internal to the object */
12024
12025uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12026{
12027 Mutex::Locker lock(client_lock);
12028 file_layout_t *layout=&(in->layout);
12029 uint32_t object_size = layout->object_size;
12030 uint32_t su = layout->stripe_unit;
12031 uint64_t stripes_per_object = object_size / su;
12032
12033 return (blockno % stripes_per_object) * su;
12034}
12035
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  // Open directory `in` for reading; on success *dirpp receives the new
  // directory handle.  Returns 0 or a negative errno.
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Skip our checks when FUSE applies default_permissions itself.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
12060
int Client::ll_releasedir(dir_result_t *dirp)
{
  // Close a directory handle opened via ll_opendir(); never fails.
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;
  _closedir(dirp);
  return 0;
}
12070
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  // Sync the directory inode behind an open directory handle
  // (full sync, not data-only).
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  return _fsync(dirp->inode.get(), false);
}
12080
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  // Open an existing inode.  O_CREAT must go through ll_create() instead.
  // On success *fhp (if non-NULL) receives the new file handle.
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track the handle so client shutdown can close anything the
  // ll_ caller never released.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12113
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  // Shared implementation behind ll_create()/ll_createx(): look the name up,
  // create it if absent and O_CREAT is set, then open it.  On success *in
  // holds the inode and *fhp (if opened) the file handle.
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name is an error.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: enforce open permissions and open it ourselves
    // (a fresh create already returned an open handle in *fhp).
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any handle we hand out so shutdown can close leaked ones.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12195
12196int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12197 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12198 const UserPerm& perms)
12199{
12200 Mutex::Locker lock(client_lock);
12201 InodeRef in;
12202
12203 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12204 fhp, perms);
12205 if (r >= 0) {
12206 assert(in);
12207
12208 // passing an Inode in outp requires an additional ref
12209 if (outp) {
12210 _ll_get(in.get());
12211 *outp = in.get();
12212 }
12213 fill_stat(in, attr);
12214 } else {
12215 attr->st_ino = 0;
12216 }
12217
12218 return r;
12219}
12220
12221int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12222 int oflags, Inode **outp, Fh **fhp,
12223 struct ceph_statx *stx, unsigned want, unsigned lflags,
12224 const UserPerm& perms)
12225{
12226 unsigned caps = statx_to_mask(lflags, want);
12227 Mutex::Locker lock(client_lock);
12228 InodeRef in;
12229
12230
12231 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12232 if (r >= 0) {
12233 assert(in);
12234
12235 // passing an Inode in outp requires an additional ref
12236 if (outp) {
12237 _ll_get(in.get());
12238 *outp = in.get();
12239 }
12240 fill_statx(in, caps, stx);
12241 } else {
12242 stx->stx_ino = 0;
12243 stx->stx_mask = 0;
12244 }
12245
12246 return r;
12247}
12248
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  // Reposition the file offset of an open handle; trace and delegate.
  Mutex::Locker lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  return _lseek(fh, offset, whence);
}
12258
12259int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12260{
12261 Mutex::Locker lock(client_lock);
12262 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12263 tout(cct) << "ll_read" << std::endl;
12264 tout(cct) << (unsigned long)fh << std::endl;
12265 tout(cct) << off << std::endl;
12266 tout(cct) << len << std::endl;
12267
12268 return _read(fh, off, len, bl);
12269}
12270
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  // Read a byte range of one RADOS object backing this inode directly from
  // the OSDs, bypassing the client cache.  Returns bytes read or a
  // negative errno.
  Mutex::Locker lock(client_lock);
  vinodeno_t vino = ll_get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop client_lock while waiting so other client work can make progress.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12303
12304/* It appears that the OSD doesn't return success unless the entire
12305 buffer was written, return the write length on success. */
12306
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  // Write a byte range of one RADOS object backing this inode directly to
  // the OSDs.  Returns `length` on success or a negative errno.
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE(review): `true ||` deliberately(?) forces every write down the
  // stable/synchronous path; the unstable-write barrier branch below is
  // dead code — confirm before relying on `sync == 0` behavior.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			      barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  // Wait for the OSD ack outside client_lock.
  client_lock.Unlock();
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12374
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  // Stub: the block-barrier commit machinery below is disabled (commented
  // out), so this currently always reports success.
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = ll_get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12400
12401int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12402{
12403 Mutex::Locker lock(client_lock);
12404 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12405 "~" << len << dendl;
12406 tout(cct) << "ll_write" << std::endl;
12407 tout(cct) << (unsigned long)fh << std::endl;
12408 tout(cct) << off << std::endl;
12409 tout(cct) << len << std::endl;
12410
12411 int r = _write(fh, off, len, data, NULL, 0);
12412 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12413 << dendl;
12414 return r;
12415}
12416
int Client::ll_flush(Fh *fh)
{
  // Flush buffered writes for an open handle (close-time flush semantics).
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _flush(fh);
}
12426
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  // Sync an open handle; when syncdataonly is true, metadata is skipped.
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
12441
12442#ifdef FALLOC_FL_PUNCH_HOLE
12443
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // fallocate(2) core: supports plain preallocation (extend i_size) and
  // FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE (zero a range).  Caller must
  // hold client_lock.  Returns 0 or a negative errno.
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Only KEEP_SIZE and PUNCH_HOLE are understood.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // As on Linux, PUNCH_HOLE requires KEEP_SIZE.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation needs pool space; punching a hole frees space so it is
  // still allowed on a full pool.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota check only applies when the file would actually grow.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // State for waiting on an async uninline of inline file data.
  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer caps: punch the hole locally by splicing
      // zeroes into the inline buffer.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Otherwise migrate any inline data to RADOS first, then zero the
      // object range directly via the Filer.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Wait for the zero to complete; client_lock must be dropped while
      // blocking on the completion.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the (sparse) file size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the inline-data migration kicked off above.
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined; that's fine.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
12578#else
12579
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Fallback when FALLOC_FL_PUNCH_HOLE is not defined by the platform:
  // fallocate is unsupported.
  return -EOPNOTSUPP;
}
12584
12585#endif
12586
12587
int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
{
  // Low-level fallocate on an open handle; see _fallocate() for semantics.
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _fallocate(fh, mode, offset, length);
}
12597
12598int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
12599{
12600 Mutex::Locker lock(client_lock);
12601 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
12602
12603 Fh *fh = get_filehandle(fd);
12604 if (!fh)
12605 return -EBADF;
12606#if defined(__linux__) && defined(O_PATH)
12607 if (fh->flags & O_PATH)
12608 return -EBADF;
12609#endif
12610 return _fallocate(fh, mode, offset, length);
12611}
12612
12613int Client::ll_release(Fh *fh)
12614{
12615 Mutex::Locker lock(client_lock);
12616 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
12617 dendl;
12618 tout(cct) << "ll_release (fh)" << std::endl;
12619 tout(cct) << (unsigned long)fh << std::endl;
12620
12621 if (ll_unclosed_fh_set.count(fh))
12622 ll_unclosed_fh_set.erase(fh);
12623 return _release_fh(fh);
12624}
12625
int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  // POSIX advisory-lock query (F_GETLK equivalent) on an open handle.
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  // NOTE(review): trace tag reads "ll_getk" (missing 'l') — kept as-is
  // since trace consumers may match on the existing string.
  tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;

  return _getlk(fh, fl, owner);
}
12635
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  // POSIX advisory-lock set (F_SETLK/F_SETLKW when sleep != 0).
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
  // NOTE(review): trace tag reads "ll_setk" (missing 'l') — kept as-is
  // since trace consumers may match on the existing string.
  tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;

  return _setlk(fh, fl, owner, sleep);
}
12645
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  // BSD flock(2)-style whole-file lock on an open handle.
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;

  return _flock(fh, cmd, owner);
}
12655
// Finisher context that interrupts an in-flight SETFILELOCK MDS request.
// Holds a reference on the request for its own lifetime.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;  // pinned via get() in the constructor
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    // Runs on the interrupt finisher thread; take client_lock before
    // touching request state.
    Mutex::Locker l(client->client_lock);
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
12671
12672void Client::ll_interrupt(void *d)
12673{
12674 MetaRequest *req = static_cast<MetaRequest*>(d);
12675 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
12676 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
12677 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
12678}
12679
12680// =========================================
12681// layout
12682
12683// expose file layouts
12684
12685int Client::describe_layout(const char *relpath, file_layout_t *lp,
12686 const UserPerm& perms)
12687{
12688 Mutex::Locker lock(client_lock);
12689
12690 filepath path(relpath);
12691 InodeRef in;
12692 int r = path_walk(path, &in, perms);
12693 if (r < 0)
12694 return r;
12695
12696 *lp = in->layout;
12697
12698 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
12699 return 0;
12700}
12701
12702int Client::fdescribe_layout(int fd, file_layout_t *lp)
12703{
12704 Mutex::Locker lock(client_lock);
12705
12706 Fh *f = get_filehandle(fd);
12707 if (!f)
12708 return -EBADF;
12709 Inode *in = f->inode.get();
12710
12711 *lp = in->layout;
12712
12713 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
12714 return 0;
12715}
12716
d2e6a577
FG
int64_t Client::get_default_pool_id()
{
  // Default data pool id for new files.
  Mutex::Locker lock(client_lock);
  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
7c673cae
FG
12723
12724// expose osdmap
12725
12726int64_t Client::get_pool_id(const char *pool_name)
12727{
12728 Mutex::Locker lock(client_lock);
12729 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
12730 pool_name);
12731}
12732
12733string Client::get_pool_name(int64_t pool)
12734{
12735 Mutex::Locker lock(client_lock);
12736 return objecter->with_osdmap([pool](const OSDMap& o) {
12737 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
12738 });
12739}
12740
12741int Client::get_pool_replication(int64_t pool)
12742{
12743 Mutex::Locker lock(client_lock);
12744 return objecter->with_osdmap([pool](const OSDMap& o) {
12745 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
12746 });
12747}
12748
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  // Report the acting OSD set for the stripe unit containing file offset
  // `off`, and optionally (via *len) how many bytes remain in that unit.
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range at `off` to its single object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
12791
12792int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
12793{
12794 Mutex::Locker lock(client_lock);
12795 if (id < 0)
12796 return -EINVAL;
12797 return objecter->with_osdmap([&](const OSDMap& o) {
12798 return o.crush->get_full_location_ordered(id, path);
12799 });
12800}
12801
12802int Client::get_file_stripe_address(int fd, loff_t offset,
12803 vector<entity_addr_t>& address)
12804{
12805 Mutex::Locker lock(client_lock);
12806
12807 Fh *f = get_filehandle(fd);
12808 if (!f)
12809 return -EBADF;
12810 Inode *in = f->inode.get();
12811
12812 // which object?
12813 vector<ObjectExtent> extents;
12814 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
12815 in->truncate_size, extents);
12816 assert(extents.size() == 1);
12817
12818 // now we have the object and its 'layout'
12819 return objecter->with_osdmap([&](const OSDMap& o) {
12820 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
12821 vector<int> osds;
12822 o.pg_to_acting_osds(pg, osds);
12823 if (osds.empty())
12824 return -EINVAL;
12825 for (unsigned i = 0; i < osds.size(); i++) {
12826 entity_addr_t addr = o.get_addr(osds[i]);
12827 address.push_back(addr);
12828 }
12829 return 0;
12830 });
12831}
12832
12833int Client::get_osd_addr(int osd, entity_addr_t& addr)
12834{
12835 Mutex::Locker lock(client_lock);
12836 return objecter->with_osdmap([&](const OSDMap& o) {
12837 if (!o.exists(osd))
12838 return -ENOENT;
12839
12840 addr = o.get_addr(osd);
12841 return 0;
12842 });
12843}
12844
12845int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
12846 loff_t length, loff_t offset)
12847{
12848 Mutex::Locker lock(client_lock);
12849
12850 Fh *f = get_filehandle(fd);
12851 if (!f)
12852 return -EBADF;
12853 Inode *in = f->inode.get();
12854
12855 // map to a list of extents
12856 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
12857
12858 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
12859 return 0;
12860}
12861
12862
12863/*
12864 * find an osd with the same ip. -1 if none.
12865 */
12866int Client::get_local_osd()
12867{
12868 Mutex::Locker lock(client_lock);
12869 objecter->with_osdmap([this](const OSDMap& o) {
12870 if (o.get_epoch() != local_osd_epoch) {
12871 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
12872 local_osd_epoch = o.get_epoch();
12873 }
12874 });
12875 return local_osd;
12876}
12877
12878
12879
12880
12881
12882
12883// ===============================
12884
// Messenger callback: a connection was established.  Nothing to do
// beyond logging; session setup happens via explicit messages.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
12889
// Messenger callback: our side of a connection was reset.  Returning
// false tells the messenger we did not take over handling of the
// connection.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
12895
// Messenger callback: the peer reset the connection.  For MDS peers,
// react according to the state of the corresponding session: closing
// sessions are finalized, opening sessions are retried, and open
// sessions are either closed for reconnect or marked stale depending
// on the client_reconnect_stale config option.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      // Match the peer address against every known session's MDS rank.
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were tearing the session down anyway; treat the reset
	  // as confirmation that it is closed.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Restart the open attempt, carrying any waiters over to
	    // the fresh session so they are not dropped.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      // Close immediately so a subsequent open triggers the
	      // MDS reconnect path.
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      // Keep the session around but mark it stale.
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing to do for sessions that never opened or are
	  // already closed.
	  break;
	}
      }
    }
    break;
  }
}
12956
// Messenger callback: the peer actively refused our connection.
// Returning false indicates we did not handle it specially.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
12962
// Messenger callback: supply an authorizer for an outgoing connection.
// Monitor connections are authenticated by the MonClient itself, so no
// authorizer is built for them.
// NOTE(review): 'force_new' is ignored here -- presumably key rotation
// is handled elsewhere by the monclient; confirm against the Messenger
// authorizer contract.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
12970
// Walk up from 'in' to find its nearest ancestor with quota enabled
// (excluding 'in' itself), stopping at root_ancestor.  Parent links
// are taken from cached dentries only when they are still trustworthy
// (valid MDS lease, or the parent dir's shared cap generation still
// matches); otherwise the parent is looked up at the MDS with
// CEPH_MDS_OP_LOOKUPNAME.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Found an ancestor (not the starting inode) with quota set: done.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to find a parent via cached metadata first.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // Dentry lease is still valid on a live session.
	  parent_in = dn->dir->parent_inode;
	} else {
	  // No lease; accept the link only if the parent directory's
	  // shared cap generation still matches the dentry's.
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      // Ancestry above our mount root (e.g. a subtree mount) is kept
      // in root_parents rather than dn_set.
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // No trustworthy cached parent: ask the MDS for it.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      // The MDS round trip may have refreshed cached ancestry above
      // 'in'; restart the walk from the beginning.
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13036
13037/**
13038 * Traverse quota ancestors of the Inode, return true
13039 * if any of them passes the passed function
13040 */
13041bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13042 std::function<bool (const Inode &in)> test)
13043{
13044 while (true) {
13045 assert(in != NULL);
13046 if (test(*in)) {
13047 return true;
13048 }
13049
13050 if (in == root_ancestor) {
13051 // We're done traversing, drop out
13052 return false;
13053 } else {
13054 // Continue up the tree
13055 in = get_quota_root(in, perms);
13056 }
13057 }
13058
13059 return false;
13060}
13061
13062bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13063{
13064 return check_quota_condition(in, perms,
13065 [](const Inode &in) {
13066 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13067 });
13068}
13069
13070bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13071 const UserPerm& perms)
13072{
13073 return check_quota_condition(in, perms,
13074 [&new_bytes](const Inode &in) {
13075 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13076 > in.quota.max_bytes;
13077 });
13078}
13079
13080bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13081{
13082 return check_quota_condition(in, perms,
13083 [](const Inode &in) {
13084 if (in.quota.max_bytes) {
13085 if (in.rstat.rbytes >= in.quota.max_bytes) {
13086 return true;
13087 }
13088
13089 assert(in.size >= in.reported_size);
13090 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13091 const uint64_t size = in.size - in.reported_size;
13092 return (space >> 4) < size;
13093 } else {
13094 return false;
13095 }
13096 });
13097}
13098
// Bit flags cached in pool_perms, keyed by (pool id, namespace); see
// check_pool_perm() for the probe protocol.
enum {
  POOL_CHECKED = 1,   // a permission probe for this pool has completed
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // client has read access to the pool
  POOL_WRITE = 8,     // client has write access to the pool
};
13105
// Verify that this client has the OSD-side capabilities needed to
// perform 'need' (CEPH_CAP_FILE_RD/WR) on the pool backing 'in'.
// Access is probed once per (pool, namespace) by issuing a stat and a
// create against the file's first object, and the result is cached in
// pool_perms.  Concurrent probes are serialized via POOL_CHECKING and
// the waiting_for_pool_perm cond list.
// Returns 0 if allowed, -EPERM if the pool denies the needed access,
// -EIO if the probe failed for an indeterminate reason.
// Called with client_lock held; it is dropped while the probe ops are
// in flight.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait for it to
  // finish and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the probe so concurrent callers block on waiting_for_pool_perm.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat on the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: an exclusive create (EEXIST still proves writability).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both ops are in flight.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // ENOENT just means the object doesn't exist; the read still succeeded.
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // Publish the probe result and wake anyone waiting on it.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13208
13209int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13210{
13211 if (acl_type == POSIX_ACL) {
13212 if (in->xattrs.count(ACL_EA_ACCESS)) {
13213 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13214
13215 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13216 }
13217 }
13218 return -EAGAIN;
13219}
13220
// Rewrite the inode's POSIX access ACL to agree with a new mode (the
// ACL's owner/group/other entries must mirror the mode bits after a
// chmod).  No-op when ACLs are disabled or no access ACL is present.
// Returns 0 on success or a negative error code.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Fetch xattrs from the MDS if we have never seen them for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a private copy; posix_acl_access_chmod mutates it in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      // Persist the adjusted ACL back as an xattr.
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13246
// Compute the ACL xattrs a new child of 'dir' should be created with,
// applying POSIX default-ACL inheritance.  On entry *mode is the
// requested creation mode; it may be adjusted in place (by ACL
// inheritance, or by the umask callback when no default ACL exists).
// On success the inherited xattrs (if any) are encoded into xattrs_bl
// and the number of xattrs is returned; 0 means no ACL xattrs to set.
// Symlinks never carry ACLs.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  // Fetch the parent's xattrs from the MDS if we have never seen them.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Copy the default ACL; posix_acl_inherit_mode mutates both the
      // ACL buffer and *mode.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// The inherited ACL is non-trivial; if it is not fully
	// representable by mode bits, store it as the access ACL.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories propagate the default ACL to their own children.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask (POSIX semantics).
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13294
13295void Client::set_filer_flags(int flags)
13296{
13297 Mutex::Locker l(client_lock);
13298 assert(flags == 0 ||
13299 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13300 objecter->add_global_op_flags(flags);
13301}
13302
13303void Client::clear_filer_flags(int flags)
13304{
13305 Mutex::Locker l(client_lock);
13306 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13307 objecter->clear_global_op_flag(flags);
13308}
13309
13310/**
13311 * This is included in cap release messages, to cause
13312 * the MDS to wait until this OSD map epoch. It is necessary
13313 * in corner cases where we cancel RADOS ops, so that
13314 * nobody else tries to do IO to the same objects in
13315 * the same epoch as the cancelled ops.
13316 */
13317void Client::set_cap_epoch_barrier(epoch_t e)
13318{
13319 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
13320 cap_epoch_barrier = e;
13321}
13322
// Config observer interface: the keys for which handle_conf_change()
// should be invoked when they change.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    NULL  // sentinel terminating the list
  };
  return keys;
}
13333
13334void Client::handle_conf_change(const struct md_config_t *conf,
13335 const std::set <std::string> &changed)
13336{
13337 Mutex::Locker lock(client_lock);
13338
13339 if (changed.count("client_cache_size") ||
13340 changed.count("client_cache_mid")) {
13341 lru.lru_set_max(cct->_conf->client_cache_size);
13342 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13343 }
13344 if (changed.count("client_acl_type")) {
13345 acl_type = NO_ACL;
13346 if (cct->_conf->client_acl_type == "posix_acl")
13347 acl_type = POSIX_ACL;
13348 }
13349}
13350
13351void Client::init_groups(UserPerm *perms)
13352{
13353 gid_t *sgids;
13354 int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13355 perms->init_gids(sgids, count);
13356}
13357
// boost::intrusive_ptr hook: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13362
// boost::intrusive_ptr hook: drop a reference on an Inode via its
// owning client (which may free the inode).
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13367
13368mds_rank_t Client::_get_random_up_mds() const
13369{
13370 assert(client_lock.is_locked_by_me());
13371
13372 std::set<mds_rank_t> up;
13373 mdsmap->get_up_mds_set(up);
13374
13375 if (up.empty())
13376 return MDS_RANK_NONE;
13377 std::set<mds_rank_t>::const_iterator p = up.begin();
13378 for (int n = rand() % up.size(); n; n--)
13379 ++p;
13380 return *p;
13381}
13382
13383
// A Client that owns its own Objecter (the base Client is handed one
// by its embedder).  The Objecter created here is deleted in the dtor.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13390
StandaloneClient::~StandaloneClient()
{
  // We own the objecter (created in our ctor); release it.
  delete objecter;
  objecter = nullptr;
}
13396
// Bring up the standalone client: timer, object cacher, objecter,
// messenger dispatchers, and the monitor client.  On monclient
// failure, unwind the partially-initialized state and return the
// error; otherwise finish via Client::_finish_init().
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // NOTE(review): client_lock is released before objecter->shutdown()
    // -- presumably to respect lock ordering with the objecter's own
    // locks; confirm before reordering.
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13427
// Tear down in reverse dependency order: the base client first, then
// the objecter and monitor client it was using.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}
13434