]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update sources to v12.1.0
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
21#include <sys/stat.h>
22#include <sys/param.h>
23#include <fcntl.h>
24#include <sys/file.h>
25#include <sys/utsname.h>
26#include <sys/uio.h>
27
28#include <boost/lexical_cast.hpp>
29#include <boost/fusion/include/std_pair.hpp>
30
31#if defined(__FreeBSD__)
32#define XATTR_CREATE 0x1
33#define XATTR_REPLACE 0x2
34#else
35#include <sys/xattr.h>
36#endif
37
38#if defined(__linux__)
39#include <linux/falloc.h>
40#endif
41
42#include <sys/statvfs.h>
43
44#include "common/config.h"
45#include "common/version.h"
46
47// ceph stuff
48#include "messages/MClientSession.h"
49#include "messages/MClientReconnect.h"
50#include "messages/MClientRequest.h"
51#include "messages/MClientRequestForward.h"
52#include "messages/MClientReply.h"
53#include "messages/MClientCaps.h"
54#include "messages/MClientLease.h"
55#include "messages/MClientSnap.h"
56#include "messages/MCommandReply.h"
57#include "messages/MOSDMap.h"
58#include "messages/MClientQuota.h"
59#include "messages/MClientCapRelease.h"
60#include "messages/MMDSMap.h"
61#include "messages/MFSMap.h"
62#include "messages/MFSMapUser.h"
63
64#include "mon/MonClient.h"
65
66#include "mds/flock.h"
67#include "osd/OSDMap.h"
68#include "osdc/Filer.h"
69
70#include "common/Cond.h"
71#include "common/Mutex.h"
72#include "common/perf_counters.h"
73#include "common/admin_socket.h"
74#include "common/errno.h"
75#include "include/str_list.h"
76
77#define dout_subsys ceph_subsys_client
78
79#include "include/lru.h"
80#include "include/compat.h"
81#include "include/stringify.h"
82
83#include "Client.h"
84#include "Inode.h"
85#include "Dentry.h"
86#include "Dir.h"
87#include "ClientSnapRealm.h"
88#include "Fh.h"
89#include "MetaSession.h"
90#include "MetaRequest.h"
91#include "ObjecterWriteback.h"
92#include "posix_acl.h"
93
94#include "include/assert.h"
95#include "include/stat.h"
96
97#include "include/cephfs/ceph_statx.h"
98
99#if HAVE_GETGROUPLIST
100#include <grp.h>
101#include <pwd.h>
102#include <unistd.h>
103#endif
104
105#undef dout_prefix
106#define dout_prefix *_dout << "client." << whoami << " "
107
108#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
109
110// FreeBSD fails to define this
111#ifndef O_DSYNC
112#define O_DSYNC 0x0
113#endif
114// Darwin fails to define this
115#ifndef O_RSYNC
116#define O_RSYNC 0x0
117#endif
118
119#ifndef O_DIRECT
120#define O_DIRECT 0x0
121#endif
122
123#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
124
125void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
126{
127 Client *client = static_cast<Client*>(p);
128 client->flush_set_callback(oset);
129}
130
131
132// -------------
133
134Client::CommandHook::CommandHook(Client *client) :
135 m_client(client)
136{
137}
138
139bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
140 std::string format, bufferlist& out)
141{
142 Formatter *f = Formatter::create(format);
143 f->open_object_section("result");
144 m_client->client_lock.Lock();
145 if (command == "mds_requests")
146 m_client->dump_mds_requests(f);
147 else if (command == "mds_sessions")
148 m_client->dump_mds_sessions(f);
149 else if (command == "dump_cache")
150 m_client->dump_cache(f);
151 else if (command == "kick_stale_sessions")
152 m_client->_kick_stale_sessions();
153 else if (command == "status")
154 m_client->dump_status(f);
155 else
156 assert(0 == "bad command registered");
157 m_client->client_lock.Unlock();
158 f->close_section();
159 f->flush(out);
160 delete f;
161 return true;
162}
163
164
165// -------------
166
// Start a fresh directory read on `in`.  next_offset begins at 2 because
// offsets 0 and 1 are reserved for the synthetic "." and ".." entries;
// the cached-readdir bookkeeping (release/ordered counts, cache index,
// shared gen) starts zeroed and is filled in by insert_readdir_results().
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
172
173void Client::_reset_faked_inos()
174{
175 ino_t start = 1024;
176 free_faked_inos.clear();
177 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
178 last_used_faked_ino = 0;
179 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
180}
181
// Allocate the next free fake inode number for `in`, scanning the free
// interval-set circularly starting just above the last number handed out.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Ran off the top of the pool; wrap around and search from the bottom.
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // The pool is assumed non-empty here; exhaustion would be a bug upstream.
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // Jump forward to the start of the next free interval.
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Still inside the current free interval; take the next number.
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Record the allocation both ways: remove from the free pool and
  // remember which real vinodeno this fake number stands for.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
201
202void Client::_release_faked_ino(Inode *in)
203{
204 free_faked_inos.insert(in->faked_ino);
205 faked_ino_map.erase(in->faked_ino);
206}
207
208vinodeno_t Client::_map_faked_ino(ino_t ino)
209{
210 vinodeno_t vino;
211 if (ino == 1)
212 vino = root->vino();
213 else if (faked_ino_map.count(ino))
214 vino = faked_ino_map[ino];
215 else
216 vino = vinodeno_t(0, CEPH_NOSNAP);
217 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
218 return vino;
219}
220
221vinodeno_t Client::map_faked_ino(ino_t ino)
222{
223 Mutex::Locker lock(client_lock);
224 return _map_faked_ino(ino);
225}
226
227// cons/des
228
// Construct a client bound to the given messenger, monitor client and
// objecter.  Only cheap, local setup happens here; network activity
// starts later via init()/mount().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    require_remount(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-1), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock")
{
  _reset_faked_inos();
  // cache state
  root = 0;

  num_flushing_caps = 0;

  // precompute listxattr buffer sizes for the virtual xattr tables
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_max(cct->_conf->client_cache_size);
  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // ask the objecter to tell us when this client gets blacklisted
  objecter->enable_blacklist_events();
}
298
299
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311
312void Client::tear_down_cache()
313{
314 // fd's
315 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
316 it != fd_map.end();
317 ++it) {
318 Fh *fh = it->second;
319 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
320 _release_fh(fh);
321 }
322 fd_map.clear();
323
324 while (!opened_dirs.empty()) {
325 dir_result_t *dirp = *opened_dirs.begin();
326 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
327 _closedir(dirp);
328 }
329
330 // caps!
331 // *** FIXME ***
332
333 // empty lru
334 lru.lru_set_max(0);
335 trim_cache();
336 assert(lru.lru_get_size() == 0);
337
338 // close root ino
339 assert(inode_map.size() <= 1 + root_parents.size());
340 if (root && inode_map.size() == 1 + root_parents.size()) {
341 delete root;
342 root = 0;
343 root_ancestor = 0;
344 while (!root_parents.empty())
345 root_parents.erase(root_parents.begin());
346 inode_map.clear();
347 _reset_faked_inos();
348 }
349
350 assert(inode_map.empty());
351}
352
353inodeno_t Client::get_root_ino()
354{
355 Mutex::Locker l(client_lock);
356 if (use_faked_inos())
357 return root->faked_ino;
358 else
359 return root->ino;
360}
361
362Inode *Client::get_root()
363{
364 Mutex::Locker l(client_lock);
365 root->ll_get();
366 return root;
367}
368
369
370// debug crapola
371
372void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
373{
374 filepath path;
375 in->make_long_path(path);
376 ldout(cct, 1) << "dump_inode: "
377 << (disconnected ? "DISCONNECTED ":"")
378 << "inode " << in->ino
379 << " " << path
380 << " ref " << in->get_num_ref()
381 << *in << dendl;
382
383 if (f) {
384 f->open_object_section("inode");
385 f->dump_stream("path") << path;
386 if (disconnected)
387 f->dump_int("disconnected", 1);
388 in->dump(f);
389 f->close_section();
390 }
391
392 did.insert(in);
393 if (in->dir) {
394 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
395 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
396 it != in->dir->dentries.end();
397 ++it) {
398 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
399 if (f) {
400 f->open_object_section("dentry");
401 it->second->dump(f);
402 f->close_section();
403 }
404 if (it->second->inode)
405 dump_inode(f, it->second->inode.get(), did, false);
406 }
407 }
408}
409
410void Client::dump_cache(Formatter *f)
411{
412 set<Inode*> did;
413
414 ldout(cct, 1) << "dump_cache" << dendl;
415
416 if (f)
417 f->open_array_section("cache");
418
419 if (root)
420 dump_inode(f, root, did, true);
421
422 // make a second pass to catch anything disconnected
423 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
424 it != inode_map.end();
425 ++it) {
426 if (did.count(it->second))
427 continue;
428 dump_inode(f, it->second, did, true);
429 }
430
431 if (f)
432 f->close_section();
433}
434
// Dump high-level client status (instance metadata, cache counters, map
// epochs) to the formatter.  Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  // fetched outside the f check so the osdmap read happens exactly once
  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
459
460int Client::init()
461{
462 timer.init();
463 objectcacher->start();
464
465 client_lock.Lock();
466 assert(!initialized);
467
468 messenger->add_dispatcher_tail(this);
469 client_lock.Unlock();
470
471 _finish_init();
472 return 0;
473}
474
475void Client::_finish_init()
476{
477 client_lock.Lock();
478 // logger
479 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
480 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
481 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
482 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
483 logger.reset(plb.create_perf_counters());
484 cct->get_perfcounters_collection()->add(logger.get());
485
486 client_lock.Unlock();
487
488 cct->_conf->add_observer(this);
489
490 AdminSocket* admin_socket = cct->get_admin_socket();
491 int ret = admin_socket->register_command("mds_requests",
492 "mds_requests",
493 &m_command_hook,
494 "show in-progress mds requests");
495 if (ret < 0) {
496 lderr(cct) << "error registering admin socket command: "
497 << cpp_strerror(-ret) << dendl;
498 }
499 ret = admin_socket->register_command("mds_sessions",
500 "mds_sessions",
501 &m_command_hook,
502 "show mds session state");
503 if (ret < 0) {
504 lderr(cct) << "error registering admin socket command: "
505 << cpp_strerror(-ret) << dendl;
506 }
507 ret = admin_socket->register_command("dump_cache",
508 "dump_cache",
509 &m_command_hook,
510 "show in-memory metadata cache contents");
511 if (ret < 0) {
512 lderr(cct) << "error registering admin socket command: "
513 << cpp_strerror(-ret) << dendl;
514 }
515 ret = admin_socket->register_command("kick_stale_sessions",
516 "kick_stale_sessions",
517 &m_command_hook,
518 "kick sessions that were remote reset");
519 if (ret < 0) {
520 lderr(cct) << "error registering admin socket command: "
521 << cpp_strerror(-ret) << dendl;
522 }
523 ret = admin_socket->register_command("status",
524 "status",
525 &m_command_hook,
526 "show overall client status");
527 if (ret < 0) {
528 lderr(cct) << "error registering admin socket command: "
529 << cpp_strerror(-ret) << dendl;
530 }
531
532 client_lock.Lock();
533 initialized = true;
534 client_lock.Unlock();
535}
536
// Tear the client down: close MDS sessions, unregister admin commands
// and the config observer, drain and stop the callback finishers, stop
// the object cacher and timer, then remove the perf counters.  The
// ordering below is deliberate — do not reorder casually.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // mirror of the registrations done in _finish_init()
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // Each finisher below was only started if its callback was registered;
  // drain pending work before stopping so no callback is dropped.
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
596
597
598// ===================
599// metadata cache stuff
600
601void Client::trim_cache(bool trim_kernel_dcache)
602{
603 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << lru.lru_get_max() << dendl;
604 unsigned last = 0;
605 while (lru.lru_get_size() != last) {
606 last = lru.lru_get_size();
607
608 if (lru.lru_get_size() <= lru.lru_get_max()) break;
609
610 // trim!
31f18b77 611 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
7c673cae
FG
612 if (!dn)
613 break; // done
614
615 trim_dentry(dn);
616 }
617
618 if (trim_kernel_dcache && lru.lru_get_size() > lru.lru_get_max())
619 _invalidate_kernel_dcache();
620
621 // hose root?
622 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
623 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
624 delete root;
625 root = 0;
626 root_ancestor = 0;
627 while (!root_parents.empty())
628 root_parents.erase(root_parents.begin());
629 inode_map.clear();
630 _reset_faked_inos();
631 }
632}
633
634void Client::trim_cache_for_reconnect(MetaSession *s)
635{
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662}
663
664void Client::trim_dentry(Dentry *dn)
665{
666 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
667 << " in dir " << hex << dn->dir->parent_inode->ino
668 << dendl;
669 if (dn->inode) {
670 Inode *diri = dn->dir->parent_inode;
671 diri->dir_release_count++;
672 clear_dir_complete_and_ordered(diri, true);
673 }
674 unlink(dn, false, false); // drop dir, drop dentry
675}
676
677
// Merge file-data-related metadata from the MDS into `in`: size and
// truncation state, change_attr, inline data, and the c/m/atime triple.
// `issued` is the set of caps we hold; it decides whether the local or
// the MDS copy of the timestamps is authoritative.  truncate_seq and
// time_warp_seq order competing updates between client and MDS.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  // remember the pre-update size so we can invalidate truncated cache below
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // take the MDS size if its truncate epoch is newer, or if it grew the
  // file within the same epoch
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change within the same truncate epoch, so >= here
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we hold caps that make our local timestamps authoritative; only
    // accept MDS times that are provably newer
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // MDS sent a time_warp_seq older than ours without us holding EXCL —
    // unexpected; log it loudly
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
783
784void Client::_fragmap_remove_non_leaves(Inode *in)
785{
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791}
792
793void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794{
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800}
801
/*
 * add_update_inode - create or refresh the in-memory Inode for an MDS
 * InodeStat reply.  Mutable fields are only taken from the MDS when the
 * MDS version is strictly newer and our caps do not make the local copy
 * authoritative.  Also installs/updates the cap the MDS granted us.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode we ever learn about becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // before mount completes we may discover ancestors of the mount
      // point; track the chain so it can be resolved later
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // caps we hold (issued, implemented, or dirty) veto MDS fields below
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  // xattrs: accept the MDS copy unless we hold XATTR_EXCL on a non-fresh
  // local version
  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    // max_size only meaningful from the auth MDS
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      // the dir is known empty; null out any cached dentries
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
937/*
938 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
939 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  Reuses an existing dentry for the name when its vino
 * matches; otherwise unlinks the stale one and relinks.  `old_dentry`
 * (if given) is the source of a rename and is unlinked.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // same inode as before; just refresh LRU position
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // name now points at a different inode; detach the stale link
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the relink so the inode cannot be destroyed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories: the old parent loses its ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    // linking a new name invalidates the target dir's ordering too
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
986void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
987{
988 utime_t dttl = from;
989 dttl += (float)dlease->duration_ms / 1000.0;
990
991 assert(dn);
992
993 if (dlease->mask & CEPH_LOCK_DN) {
994 if (dttl > dn->lease_ttl) {
995 ldout(cct, 10) << "got dentry lease on " << dn->name
996 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
997 dn->lease_ttl = dttl;
998 dn->lease_mds = session->mds_num;
999 dn->lease_seq = dlease->seq;
1000 dn->lease_gen = session->cap_gen;
1001 }
1002 }
1003 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1004}
1005
1006
1007/*
1008 * update MDS location cache for a single inode
1009 */
/*
 * update MDS location cache for a single inode: record (or forget) which
 * MDS is authoritative for one dirfrag, force the fragtree to treat that
 * frag as a leaf, and note whether the dir is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth means "unknown"; drop any stale hint
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
              << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
              << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043{
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057}
1058
1059/*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
1062 void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1063 
1064   MClientReply *reply = request->reply;
1065   ConnectionRef con = request->reply->get_connection();
     // feature bits negotiated on this connection control how InodeStat decodes
1066   uint64_t features = con->get_features();
1067 
1068   dir_result_t *dirp = request->dirp;
1069   assert(dirp);
1070 
1071   // the extra buffer list is only set for readdir and lssnap replies
1072   bufferlist::iterator p = reply->get_extra_bl().begin();
1073   if (!p.end()) {
1074     // snapdir?
1075     if (request->head.op == CEPH_MDS_OP_LSSNAP) {
1076       assert(diri);
1077       diri = open_snapdir(diri);
1078     }
1079 
1080     // only open dir if we're actually adding stuff to it!
1081     Dir *dir = diri->open_dir();
1082     assert(dir);
1083 
1084     // dirstat
1085     DirStat dst(p);
1086     __u32 numdn;
1087     __u16 flags;
1088     ::decode(numdn, p);
1089     ::decode(flags, p);
1090 
     // FRAG_END: this reply covers the remainder of the fragment.
     // HASH_ORDER: entries are ordered by dentry name hash, not frag+index.
1091     bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1092     bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1093 
1094     frag_t fg = (unsigned)request->head.args.readdir.frag;
1095     unsigned readdir_offset = dirp->next_offset;
1096     string readdir_start = dirp->last_name;
     // offset 2 is the first real entry (0/1 are reserved for . and ..)
1097     assert(!readdir_start.empty() || readdir_offset == 2);
1098 
1099     unsigned last_hash = 0;
1100     if (hash_order) {
1101       if (!readdir_start.empty()) {
1102 	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1103       } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1104 	/* mds understands offset_hash */
1105 	last_hash = (unsigned)request->head.args.readdir.offset_hash;
1106       }
1107     }
1108 
     // the MDS may have re-fragmented the dir since we sent the request;
     // trust the frag it replied with
1109     if (fg != dst.frag) {
1110       ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1111       fg = dst.frag;
1112       if (!hash_order) {
1113 	readdir_offset = 2;
1114 	readdir_start.clear();
1115 	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1116       }
1117     }
1118 
1119     ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1120 		   << ", hash_order=" << hash_order
1121 		   << ", readdir_start " << readdir_start
1122 		   << ", last_hash " << last_hash
1123 		   << ", next_offset " << readdir_offset << dendl;
1124 
     // a listing that begins at the very start of the dir may seed the
     // shared readdir cache; record the generation counters so we can
     // detect invalidation while we fill it
1125     if (diri->snapid != CEPH_SNAPDIR &&
1126 	fg.is_leftmost() && readdir_offset == 2 &&
1127 	!(hash_order && last_hash)) {
1128       dirp->release_count = diri->dir_release_count;
1129       dirp->ordered_count = diri->dir_ordered_count;
1130       dirp->start_shared_gen = diri->shared_gen;
1131       dirp->cache_index = 0;
1132     }
1133 
1134     dirp->buffer_frag = fg;
1135 
1136     _readdir_drop_dirp_buffer(dirp);
1137     dirp->buffer.reserve(numdn);
1138 
     // decode each (name, lease, inode) triple and splice it into the
     // dentry cache, replacing any stale dentry that points elsewhere
1139     string dname;
1140     LeaseStat dlease;
1141     for (unsigned i=0; i<numdn; i++) {
1142       ::decode(dname, p);
1143       ::decode(dlease, p);
1144       InodeStat ist(p, features);
1145 
1146       ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1147 
1148       Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1149 				   request->perms);
1150       Dentry *dn;
1151       if (diri->dir->dentries.count(dname)) {
1152 	Dentry *olddn = diri->dir->dentries[dname];
1153 	if (olddn->inode != in) {
1154 	  // replace incorrect dentry
1155 	  unlink(olddn, true, true);  // keep dir, dentry
1156 	  dn = link(dir, dname, in, olddn);
1157 	  assert(dn == olddn);
1158 	} else {
1159 	  // keep existing dn
1160 	  dn = olddn;
1161 	  touch_dn(dn);
1162 	}
1163       } else {
1164 	// new dn
1165 	dn = link(dir, dname, in, NULL);
1166       }
1167 
1168       update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1169       if (hash_order) {
     // in hash order the per-entry index restarts whenever the hash changes
1170 	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1171 	if (hash != last_hash)
1172 	  readdir_offset = 2;
1173 	last_hash = hash;
1174 	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1175       } else {
1176 	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1177       }
1178       // add to readdir cache
     // only while the dir generation counters still match what we recorded
     // above; otherwise the cache was invalidated mid-listing
1179       if (dirp->release_count == diri->dir_release_count &&
1180 	  dirp->ordered_count == diri->dir_ordered_count &&
1181 	  dirp->start_shared_gen == diri->shared_gen) {
1182 	if (dirp->cache_index == dir->readdir_cache.size()) {
1183 	  if (i == 0) {
1184 	    assert(!dirp->inode->is_complete_and_ordered());
1185 	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
1186 	  }
1187 	  dir->readdir_cache.push_back(dn);
1188 	} else if (dirp->cache_index < dir->readdir_cache.size()) {
1189 	  if (dirp->inode->is_complete_and_ordered())
1190 	    assert(dir->readdir_cache[dirp->cache_index] == dn);
1191 	  else
1192 	    dir->readdir_cache[dirp->cache_index] = dn;
1193 	} else {
1194 	  assert(0 == "unexpected readdir buffer idx");
1195 	}
1196 	dirp->cache_index++;
1197       }
1198       // add to cached result list
1199       dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1200       ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1201     }
1202 
     // remember where the next readdir request should resume
1203     if (numdn > 0)
1204       dirp->last_name = dname;
1205     if (end)
1206       dirp->next_offset = 2;
1207     else
1208       dirp->next_offset = readdir_offset;
1209 
1210     if (dir->is_empty())
1211       close_dir(dir);
1212   }
1213 }
1214
1215/** insert_trace
1216 *
1217 * insert a trace from a MDS reply into the cache.
1218 */
1219 Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1220 {
1221   MClientReply *reply = request->reply;
1222   int op = request->get_op();
1223 
1224   ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1225 		 << " is_target=" << (int)reply->head.is_target
1226 		 << " is_dentry=" << (int)reply->head.is_dentry
1227 		 << dendl;
1228 
1229   bufferlist::iterator p = reply->get_trace_bl().begin();
     // a safe reply after an unsafe one for the same request carries no
     // trace; the cache was already updated by the unsafe reply
1230   if (request->got_unsafe) {
1231     ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
1232     assert(p.end());
1233     return NULL;
1234   }
1235 
1236   if (p.end()) {
1237     ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1238 
     // traceless reply: we can't trust our cached view of the parent dir
     // anymore, so bump its release count and drop completeness
1239     Dentry *d = request->dentry();
1240     if (d) {
1241       Inode *diri = d->dir->parent_inode;
1242       diri->dir_release_count++;
1243       clear_dir_complete_and_ordered(diri, true);
1244     }
1245 
     // for successful mutations we still have to apply the obvious local
     // effect (unlink the source/victim dentry) despite the missing trace
1246     if (d && reply->get_result() == 0) {
1247       if (op == CEPH_MDS_OP_RENAME) {
1248 	// rename
1249 	Dentry *od = request->old_dentry();
1250 	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
1251 	assert(od);
1252 	unlink(od, true, true);  // keep dir, dentry
1253       } else if (op == CEPH_MDS_OP_RMDIR ||
1254 		 op == CEPH_MDS_OP_UNLINK) {
1255 	// unlink, rmdir
1256 	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1257 	unlink(d, true, true);  // keep dir, dentry
1258       }
1259     }
1260     return NULL;
1261   }
1262 
1263   ConnectionRef con = request->reply->get_connection();
1264   uint64_t features = con->get_features();
1265   ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1266 
1267   // snap trace
1268   SnapRealm *realm = NULL;
1269   if (reply->snapbl.length())
1270     update_snap_trace(reply->snapbl, &realm);
1271 
1272   ldout(cct, 10) << " hrm "
1273 		 << " is_target=" << (int)reply->head.is_target
1274 		 << " is_dentry=" << (int)reply->head.is_dentry
1275 		 << dendl;
1276 
1277   InodeStat dirst;
1278   DirStat dst;
1279   string dname;
1280   LeaseStat dlease;
1281   InodeStat ist;
1282 
     // the trace is (parent dir stat, dir frag stat, dentry name, lease)
     // when is_dentry is set, followed by the target inode stat when
     // is_target is set
1283   if (reply->head.is_dentry) {
1284     dirst.decode(p, features);
1285     dst.decode(p);
1286     ::decode(dname, p);
1287     ::decode(dlease, p);
1288   }
1289 
1290   Inode *in = 0;
1291   if (reply->head.is_target) {
1292     ist.decode(p, features);
1293     if (cct->_conf->client_debug_getattr_caps) {
     // debug check: if we asked for xattrs the MDS must have sent them
1294       unsigned wanted = 0;
1295       if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1296 	wanted = request->head.args.getattr.mask;
1297       else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1298 	wanted = request->head.args.open.mask;
1299 
1300       if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1301 	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
1302 	assert(0 == "MDS reply does not contain xattrs");
1303     }
1304 
1305     in = add_update_inode(&ist, request->sent_stamp, session,
1306 			  request->perms);
1307   }
1308 
1309   Inode *diri = NULL;
1310   if (reply->head.is_dentry) {
1311     diri = add_update_inode(&dirst, request->sent_stamp, session,
1312 			    request->perms);
1313     update_dir_dist(diri, &dst);  // dir stat info is attached to ..
1314 
1315     if (in) {
     // link (or relink) the target inode under the traced dentry
1316       Dir *dir = diri->open_dir();
1317       insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1318 			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1319     } else {
     // no target: the dentry is a null dentry; unlink any stale inode and
     // keep a lease-backed negative dentry if the MDS granted one
1320       Dentry *dn = NULL;
1321       if (diri->dir && diri->dir->dentries.count(dname)) {
1322 	dn = diri->dir->dentries[dname];
1323 	if (dn->inode) {
1324 	  diri->dir_ordered_count++;
1325 	  clear_dir_complete_and_ordered(diri, false);
1326 	  unlink(dn, true, true);  // keep dir, dentry
1327 	}
1328       }
1329       if (dlease.duration_ms > 0) {
1330 	if (!dn) {
1331 	  Dir *dir = diri->open_dir();
1332 	  dn = link(dir, dname, NULL, NULL);
1333 	}
1334 	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1335       }
1336     }
1337   } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1338 	     op == CEPH_MDS_OP_MKSNAP) {
1339     ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1340     // fake it for snap lookup
1341     vinodeno_t vino = ist.vino;
1342     vino.snapid = CEPH_SNAPDIR;
1343     assert(inode_map.count(vino));
1344     diri = inode_map[vino];
1345 
1346     string dname = request->path.last_dentry();
1347 
1348     LeaseStat dlease;
1349     dlease.duration_ms = 0;
1350 
1351     if (in) {
1352       Dir *dir = diri->open_dir();
1353       insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1354     } else {
1355       if (diri->dir && diri->dir->dentries.count(dname)) {
1356 	Dentry *dn = diri->dir->dentries[dname];
1357 	if (dn->inode)
1358 	  unlink(dn, true, true);  // keep dir, dentry
1359       }
1360     }
1361   }
1362 
1363   if (in) {
1364     if (op == CEPH_MDS_OP_READDIR ||
1365 	op == CEPH_MDS_OP_LSSNAP) {
1366       insert_readdir_results(request, session, in);
1367     } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1368       // hack: return parent inode instead
1369       in = diri;
1370     }
1371 
1372     if (request->dentry() == NULL && in != request->inode()) {
1373       // pin the target inode if its parent dentry is not pinned
1374       request->set_other_inode(in);
1375     }
1376   }
1377 
1378   if (realm)
1379     put_snap_realm(realm);
1380 
1381   request->target = in;
1382   return in;
1383 }
1384
1385// -------
1386
     // Pick the MDS rank a request should be sent to: an explicitly
     // requested resend target, then a dir-fragment hash mapping, then
     // whichever MDS holds caps on the relevant inode, and finally a
     // random active MDS.  *phash_diri is set when the choice came from a
     // dirfrag hash so the caller can invalidate that mapping on failure.
1387 mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1388 {
1389   mds_rank_t mds = MDS_RANK_NONE;
1390   __u32 hash = 0;
1391   bool is_hash = false;
1392 
1393   Inode *in = NULL;
1394   Dentry *de = NULL;
1395   Cap *cap = NULL;
1396 
     // a forward/resend target overrides everything else (one shot)
1397   if (req->resend_mds >= 0) {
1398     mds = req->resend_mds;
1399     req->resend_mds = -1;
1400     ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
1401     goto out;
1402   }
1403 
1404   if (cct->_conf->client_use_random_mds)
1405     goto random_mds;
1406 
     // derive the inode (and possibly a dentry-name hash) to route by
1407   in = req->inode();
1408   de = req->dentry();
1409   if (in) {
1410     ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
1411     if (req->path.depth()) {
1412       hash = in->hash_dentry_name(req->path[0]);
1413       ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
1414 		     << " on " << req->path[0]
1415 		     << " => " << hash << dendl;
1416       is_hash = true;
1417     }
1418   } else if (de) {
1419     if (de->inode) {
1420       in = de->inode.get();
1421       ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
1422     } else {
1423       in = de->dir->parent_inode;
1424       hash = in->hash_dentry_name(de->name);
1425       ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
1426 		     << " on " << de->name
1427 		     << " => " << hash << dendl;
1428       is_hash = true;
1429     }
1430   }
1431   if (in) {
     // snapped inodes carry no caps of their own; walk up to the nearest
     // non-snap ancestor and route by that instead
1432     if (in->snapid != CEPH_NOSNAP) {
1433       ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
1434       while (in->snapid != CEPH_NOSNAP) {
1435 	if (in->snapid == CEPH_SNAPDIR)
1436 	  in = in->snapdir_parent.get();
1437 	else if (!in->dn_set.empty())
1438 	  /* In most cases there will only be one dentry, so getting it
1439 	   * will be the correct action. If there are multiple hard links,
1440 	   * I think the MDS should be able to redirect as needed*/
1441 	  in = in->get_first_parent()->dir->parent_inode;
1442 	else {
1443 	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1444 	  break;
1445 	}
1446       }
1447       is_hash = false;
1448     }
1449 
1450     ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
1451 	     << " hash=" << hash << dendl;
1452 
     // if we know which MDS serves the fragment containing this dentry
     // hash, go straight there
1453     if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
1454       frag_t fg = in->dirfragtree[hash];
1455       if (in->fragmap.count(fg)) {
1456 	mds = in->fragmap[fg];
1457 	if (phash_diri)
1458 	  *phash_diri = in;
1459 	ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
1460 	goto out;
1461       }
1462     }
1463 
     // otherwise prefer the auth cap's MDS, then any cap-holding MDS
1464     if (req->auth_is_best())
1465       cap = in->auth_cap;
1466     if (!cap && !in->caps.empty())
1467       cap = in->caps.begin()->second;
1468     if (!cap)
1469       goto random_mds;
1470     mds = cap->session->mds_num;
1471     ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
1472 
1473     goto out;
1474   }
1475 
1476 random_mds:
1477   if (mds < 0) {
1478     mds = _get_random_up_mds();
1479     ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1480   }
1481 
1482 out:
1483   ldout(cct, 20) << "mds is " << mds << dendl;
1484   return mds;
1485 }
1486
1487
1488void Client::connect_mds_targets(mds_rank_t mds)
1489{
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503}
1504
1505void Client::dump_mds_sessions(Formatter *f)
1506{
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516}
1517void Client::dump_mds_requests(Formatter *f)
1518{
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526}
1527
     // After a reply has been processed, make sure the caller gets a
     // usable target inode even when the MDS sent a traceless reply
     // (e.g. after replay).  Falls back to a lookup/getattr by name and
     // cross-checks the created ino when one was returned.
1528 int Client::verify_reply_trace(int r,
1529 			       MetaRequest *request, MClientReply *reply,
1530 			       InodeRef *ptarget, bool *pcreated,
1531 			       const UserPerm& perms)
1532 {
1533   // check whether this request actually did the create, and set created flag
1534   bufferlist extra_bl;
1535   inodeno_t created_ino;
1536   bool got_created_ino = false;
1537   ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1538 
1539   extra_bl.claim(reply->get_extra_bl());
1540   if (extra_bl.length() >= 8) {
1541     // if the extra bufferlist has a buffer, we assume its the created inode
1542     // and that this request to create succeeded in actually creating
1543     // the inode (won the race with other create requests)
1544     ::decode(created_ino, extra_bl);
1545     got_created_ino = true;
1546     ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
1547   }
1548 
1549   if (pcreated)
1550     *pcreated = got_created_ino;
1551 
1552   if (request->target) {
     // normal case: insert_trace already resolved the target inode
1553     *ptarget = request->target;
1554     ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1555   } else {
1556     if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1557       (*ptarget) = p->second;
1558       ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1559     } else {
1560       // we got a traceless reply, and need to look up what we just
1561       // created. for now, do this by name. someday, do this by the
1562       // ino... which we know!  FIXME.
1563       InodeRef target;
1564       Dentry *d = request->dentry();
1565       if (d) {
1566 	if (d->dir) {
1567 	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
1568 			 << d->dir->parent_inode->ino << "/" << d->name
1569 			 << " got_ino " << got_created_ino
1570 			 << " ino " << created_ino
1571 			 << dendl;
1572 	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1573 			 &target, perms);
1574 	} else {
1575 	  // if the dentry is not linked, just do our best. see #5021.
1576 	  assert(0 == "how did this happen?  i want logs!");
1577 	}
1578       } else {
     // no dentry on the request: refresh the inode we operated on directly
1579 	Inode *in = request->inode();
1580 	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1581 		       << in->ino << dendl;
1582 	r = _getattr(in, request->regetattr_mask, perms, true);
1583 	target = in;
1584       }
1585       if (r >= 0) {
1586 	// verify ino returned in reply and trace_dist are the same
     // a mismatch means another client raced us between create and lookup
1587 	if (got_created_ino &&
1588 	    created_ino.val != target->ino.val) {
1589 	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
1590 	  r = -EINTR;
1591 	}
1592 	if (ptarget)
1593 	  ptarget->swap(target);
1594       }
1595     }
1596   }
1597 
1598   return r;
1599 }
1600
1601
1602/**
1603 * make a request
1604 *
1605 * Blocking helper to make an MDS request.
1606 *
1607 * If the ptarget flag is set, behavior changes slightly: the caller
1608 * expects to get a pointer to the inode we are creating or operating
1609 * on. As a result, we will follow up any traceless mutation reply
1610 * with a getattr or lookup to transparently handle a traceless reply
1611 * from the MDS (as when the MDS restarts and the client has to replay
1612 * a request).
1613 *
1614 * @param request the MetaRequest to execute
1615 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1616 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1617 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1618 * @param use_mds [optional] prefer a specific mds (-1 for default)
1619 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1620 */
1621 int Client::make_request(MetaRequest *request,
1622 			 const UserPerm& perms,
1623 			 InodeRef *ptarget, bool *pcreated,
1624 			 mds_rank_t use_mds,
1625 			 bufferlist *pdirbl)
1626 {
1627   int r = 0;
1628 
1629   // assign a unique tid
1630   ceph_tid_t tid = ++last_tid;
1631   request->set_tid(tid);
1632 
1633   // and timestamp
1634   request->op_stamp = ceph_clock_now();
1635 
1636   // make note
     // register the request; take a ref (request->get()) owned by this call
1637   mds_requests[tid] = request->get();
     // setfilelock requests can block indefinitely, so they are excluded
     // from the oldest_tid bookkeeping reported to the MDS
1638   if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1639     oldest_tid = tid;
1640 
1641   request->set_caller_perms(perms);
1642 
1643   if (cct->_conf->client_inject_fixed_oldest_tid) {
1644     ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1645     request->set_oldest_client_tid(1);
1646   } else {
1647     request->set_oldest_client_tid(oldest_tid);
1648   }
1649 
1650   // hack target mds?
1651   if (use_mds >= 0)
1652     request->resend_mds = use_mds;
1653 
     // send/resend loop: pick an MDS, ensure a session, send, then wait
     // for a reply, a forward, or a kick; loop until we get a reply or
     // the request is aborted
1654   while (1) {
1655     if (request->aborted())
1656       break;
1657 
     // (the stray "31f18b77 / FG" style lines below are git-blame
     // extraction artifacts from the source dump, not code)
31f18b77
FG
1658     if (blacklisted) {
1659       request->abort(-EBLACKLISTED);
1660       break;
1661     }
1662 
7c673cae
FG
1663     // set up wait cond
1664     Cond caller_cond;
1665     request->caller_cond = &caller_cond;
1666 
1667     // choose mds
1668     Inode *hash_diri = NULL;
1669     mds_rank_t mds = choose_target_mds(request, &hash_diri);
1670     int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1671     if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1672       if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
     // chosen rank no longer exists; drop the stale fragmap hint (if that
     // is where it came from) or retry with a random rank
1673 	if (hash_diri) {
1674 	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1675 	  _fragmap_remove_stopped_mds(hash_diri, mds);
1676 	} else {
1677 	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1678 	  request->resend_mds = _get_random_up_mds();
1679 	}
1680       } else {
1681 	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1682 	wait_on_list(waiting_for_mdsmap);
1683       }
1684       continue;
1685     }
1686 
1687     // open a session?
1688     MetaSession *session = NULL;
1689     if (!have_open_session(mds)) {
1690       session = _get_or_open_mds_session(mds);
1691 
1692       // wait
1693       if (session->state == MetaSession::STATE_OPENING) {
1694 	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1695 	wait_on_context_list(session->waiting_for_open);
1696         // Abort requests on REJECT from MDS
1697         if (rejected_by_mds.count(mds)) {
1698           request->abort(-EPERM);
1699           break;
1700         }
1701 	continue;
1702       }
1703 
1704       if (!have_open_session(mds))
1705 	continue;
1706     } else {
1707       session = mds_sessions[mds];
1708     }
1709 
1710     // send request.
1711     send_request(request, session);
1712 
1713     // wait for signal
1714     ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1715     request->kick = false;
     // client_lock is released while waiting; wake on reply, forward
     // (resend_mds set), or an explicit kick
1716     while (!request->reply &&         // reply
1717 	   request->resend_mds < 0 && // forward
1718 	   !request->kick)
1719       caller_cond.Wait(client_lock);
1720     request->caller_cond = NULL;
1721 
1722     // did we get a reply?
1723     if (request->reply)
1724       break;
1725   }
1726 
1727   if (!request->reply) {
     // only an abort can get us here without a reply; return the abort code
1728     assert(request->aborted());
1729     assert(!request->got_unsafe);
1730     r = request->get_abort_code();
1731     request->item.remove_myself();
1732     unregister_request(request);
1733     put_request(request); // ours
1734     return r;
1735   }
1736 
1737   // got it!
1738   MClientReply *reply = request->reply;
1739   request->reply = NULL;
1740   r = reply->get_result();
1741   if (r >= 0)
1742     request->success = true;
1743 
1744   // kick dispatcher (we've got it!)
     // the dispatcher thread is parked in handle_client_reply waiting for
     // us to consume the reply; let it finish
1745   assert(request->dispatch_cond);
1746   request->dispatch_cond->Signal();
1747   ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1748   request->dispatch_cond = 0;
1749 
1750   if (r >= 0 && ptarget)
1751     r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1752 
1753   if (pdirbl)
1754     pdirbl->claim(reply->get_extra_bl());
1755 
1756   // -- log times --
1757   utime_t lat = ceph_clock_now();
1758   lat -= request->sent_stamp;
1759   ldout(cct, 20) << "lat " << lat << dendl;
1760   logger->tinc(l_c_lat, lat);
1761   logger->tinc(l_c_reply, lat);
1762 
1763   put_request(request);
1764 
1765   reply->put();
1766   return r;
1767 }
1768
1769void Client::unregister_request(MetaRequest *req)
1770{
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787}
1788
     // Drop a reference on the request; on the last ref, delete it and,
     // for ops that may have orphaned an inode (rmdir/rename/rmsnap),
     // try to trim the now-unreferenced other_inode.
1789 void Client::put_request(MetaRequest *request)
1790 {
1791   if (request->_put()) {
1792     int op = -1;
1793     if (request->success)
1794       op = request->get_op();
     // take the other_inode ref out before deleting the request so it
     // outlives the delete and can be trimmed below
1795     InodeRef other_in;
1796     request->take_other_inode(&other_in);
1797     delete request;
1798 
1799     if (other_in &&
1800 	(op == CEPH_MDS_OP_RMDIR ||
1801 	 op == CEPH_MDS_OP_RENAME ||
1802 	 op == CEPH_MDS_OP_RMSNAP)) {
1803       _try_to_trim_inode(other_in.get(), false);
1804     }
1805   }
1806 }
1807
     // Encode a cap release for this inode into the request if we can
     // drop the requested caps (or if forced).  Returns nonzero when a
     // release record was appended to req->cap_releases.
1808 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 			 mds_rank_t mds, int drop,
1810 			 int unless, int force)
1811 {
     // NOTE(review): the "have:" field below streams no value before
     // ", force:" — looks like a dropped operand in the log line; confirm
     // intended output before changing the message.
1812   ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 	   << ", have:" << ", force:" << force << ")" << dendl;
1815   int released = 0;
1816   if (in->caps.count(mds)) {
1817     Cap *caps = in->caps[mds];
     // never drop caps that are dirty or currently in use
1818     drop &= ~(in->dirty_caps | get_caps_used(in));
1819     if ((drop & caps->issued) &&
1820 	!(unless & caps->issued)) {
1821       ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822       caps->issued &= ~drop;
1823       caps->implemented &= ~drop;
1824       released = 1;
1825       ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826     } else {
     // nothing droppable; still emit a release record if forced
1827       released = force;
1828     }
1829     if (released) {
1830       ceph_mds_request_release rel;
1831       rel.ino = in->ino;
1832       rel.cap_id = caps->cap_id;
1833       rel.seq = caps->seq;
1834       rel.issue_seq = caps->issue_seq;
1835       rel.mseq = caps->mseq;
1836       rel.caps = caps->implemented;
1837       rel.wanted = caps->wanted;
1838       rel.dname_len = 0;
1839       rel.dname_seq = 0;
1840       req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841     }
1842   }
1843   ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 	   << released << dendl;
1845   return released;
1846 }
1847
     // Encode a dentry-lease release: first release caps on the parent
     // dir inode (forced), then, if we hold a lease on this dentry from
     // the same MDS, attach the dentry name/seq to that release record.
1848 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1849 			   mds_rank_t mds, int drop, int unless)
1850 {
1851   ldout(cct, 20) << "encode_dentry_release enter(dn:"
1852 	   << dn << ")" << dendl;
1853   int released = 0;
1854   if (dn->dir)
1855     released = encode_inode_release(dn->dir->parent_inode, req,
1856 				    mds, drop, unless, 1);
     // piggy-back the dentry lease on the release record just appended
1857   if (released && dn->lease_mds == mds) {
1858     ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1859     MClientRequest::Release& rel = req->cap_releases.back();
1860     rel.item.dname_len = dn->name.length();
1861     rel.item.dname_seq = dn->lease_seq;
1862     rel.dname = dn->name;
1863   }
1864   ldout(cct, 25) << "encode_dentry_release exit(dn:"
1865 	   << dn << ")" << dendl;
1866 }
1867
1868
1869/*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
1875 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1876 {
1877   ldout(cct, 20) << "encode_cap_releases enter (req: "
1878 		 << req << ", mds: " << mds << ")" << dendl;
     // encode releases for each inode/dentry the request declared it can
     // drop caps on; *_drop/*_unless masks are set when the request was
     // built
1879   if (req->inode_drop && req->inode())
1880     encode_inode_release(req->inode(), req,
1881 			 mds, req->inode_drop,
1882 			 req->inode_unless);
1883 
1884   if (req->old_inode_drop && req->old_inode())
1885     encode_inode_release(req->old_inode(), req,
1886 			 mds, req->old_inode_drop,
1887 			 req->old_inode_unless);
1888   if (req->other_inode_drop && req->other_inode())
1889     encode_inode_release(req->other_inode(), req,
1890 			 mds, req->other_inode_drop,
1891 			 req->other_inode_unless);
1892 
1893   if (req->dentry_drop && req->dentry())
1894     encode_dentry_release(req->dentry(), req,
1895 			  mds, req->dentry_drop,
1896 			  req->dentry_unless);
1897 
1898   if (req->old_dentry_drop && req->old_dentry())
1899     encode_dentry_release(req->old_dentry(), req,
1900 			  mds, req->old_dentry_drop,
1901 			  req->old_dentry_unless);
1902   ldout(cct, 25) << "encode_cap_releases exit (req: "
1903 		 << req << ", mds " << mds <<dendl;
1904 }
1905
1906bool Client::have_open_session(mds_rank_t mds)
1907{
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912}
1913
1914MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915{
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922}
1923
1924MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925{
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929}
1930
1931/**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935 void Client::populate_metadata(const std::string &mount_root)
1936 {
1937   // Hostname
1938   struct utsname u;
1939   int r = uname(&u);
1940   if (r >= 0) {
1941     metadata["hostname"] = u.nodename;
1942     ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943   } else {
     // NOTE(review): uname() returns -1 and sets errno on failure, so
     // cpp_strerror(r) formats -1 rather than the real error — probably
     // should be cpp_strerror(errno); confirm before changing.
1944     ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945   }
1946 
1947   metadata["pid"] = stringify(getpid());
1948 
1949   // Ceph entity id (the '0' in "client.0")
1950   metadata["entity_id"] = cct->_conf->name.get_id();
1951 
1952   // Our mount position
1953   if (!mount_root.empty()) {
1954     metadata["root"] = mount_root;
1955   }
1956 
1957   // Ceph version
1958   metadata["ceph_version"] = pretty_version_to_str();
1959   metadata["ceph_sha1"] = git_version_to_str();
1960 
1961   // Apply any metadata from the user's configured overrides
1962   std::vector<std::string> tokens;
1963   get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964   for (const auto &i : tokens) {
1965     auto eqpos = i.find("=");
1966     // Throw out anything that isn't of the form "<str>=<str>"
     // NOTE(review): `eqpos == i.size()` can never be true (find returns
     // an index < size or npos) — possibly meant to reject a trailing
     // '=' (empty value), i.e. eqpos == i.size() - 1; confirm intent.
1967     if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968       lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969       continue;
1970     }
1971     metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972   }
1973 }
1974
1975/**
1976 * Optionally add or override client metadata fields.
1977 */
1978void Client::update_metadata(std::string const &k, std::string const &v)
1979{
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989}
1990
     // Create a session record for the given MDS rank and (unless that
     // daemon previously rejected us) send a session-open request.
     // Returns the new session in STATE_OPENING.
1991 MetaSession *Client::_open_mds_session(mds_rank_t mds)
1992 {
1993   ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1994   assert(mds_sessions.count(mds) == 0);
1995   MetaSession *session = new MetaSession;
1996   session->mds_num = mds;
1997   session->seq = 0;
1998   session->inst = mdsmap->get_inst(mds);
1999   session->con = messenger->get_connection(session->inst);
2000   session->state = MetaSession::STATE_OPENING;
2001   session->mds_state = MDSMap::STATE_NULL;
2002   mds_sessions[mds] = session;
2003 
2004   // Maybe skip sending a request to open if this MDS daemon
2005   // has previously sent us a REJECT.
2006   if (rejected_by_mds.count(mds)) {
     // same daemon instance: don't bother retrying, it will reject again
2007     if (rejected_by_mds[mds] == session->inst) {
2008       ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2009                        "because we were rejected" << dendl;
2010       return session;
2011     } else {
     // the rank is now served by a different daemon; forget the old reject
2012       ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2013                        "rejected us, trying with new inst" << dendl;
2014       rejected_by_mds.erase(mds);
2015     }
2016   }
2017 
2018   MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2019   m->client_meta = metadata;
2020   session->con->send_message(m);
2021   return session;
2022 }
2023
     // Ask the MDS to close this session; the session moves to CLOSING
     // and is torn down when the CLOSE reply arrives (_closed_mds_session).
2024 void Client::_close_mds_session(MetaSession *s)
2025 {
2026   ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2027   s->state = MetaSession::STATE_CLOSING;
2028   s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2029 }
2030
     // Tear down a session that is now closed: mark the connection down,
     // wake anyone waiting on it, release its caps, requeue its requests,
     // and free the session object.
2031 void Client::_closed_mds_session(MetaSession *s)
2032 {
2033   s->state = MetaSession::STATE_CLOSED;
2034   s->con->mark_down();
     // wake waiters before removing caps/requests so they observe the
     // CLOSED state and can react
2035   signal_context_list(s->waiting_for_open);
2036   mount_cond.Signal();
2037   remove_session_caps(s);
2038   kick_requests_closed(s);
2039   mds_sessions.erase(s->mds_num);
2040   delete s;
2041 }
2042
     // Dispatch an incoming MClientSession message from an MDS to the
     // matching session state transition.  Messages from a connection we
     // no longer track are dropped.
2043 void Client::handle_client_session(MClientSession *m)
2044 {
2045   mds_rank_t from = mds_rank_t(m->get_source().num());
2046   ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;
2047 
2048   MetaSession *session = _get_mds_session(from, m->get_connection().get());
2049   if (!session) {
2050     ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2051     m->put();
2052     return;
2053   }
2054 
2055   switch (m->get_op()) {
2056   case CEPH_SESSION_OPEN:
     // session established: renew caps and either finish unmount or open
     // sessions to this MDS's export targets
2057     renew_caps(session);
2058     session->state = MetaSession::STATE_OPEN;
2059     if (unmounting)
2060       mount_cond.Signal();
2061     else
2062       connect_mds_targets(from);
2063     signal_context_list(session->waiting_for_open);
2064     break;
2065 
2066   case CEPH_SESSION_CLOSE:
2067     _closed_mds_session(session);
2068     break;
2069 
2070   case CEPH_SESSION_RENEWCAPS:
     // only accept the renewal if it answers our latest renew request
2071     if (session->cap_renew_seq == m->get_seq()) {
2072       session->cap_ttl =
2073 	session->last_cap_renew_request + mdsmap->get_session_timeout();
2074       wake_inode_waiters(session);
2075     }
2076     break;
2077 
2078   case CEPH_SESSION_STALE:
2079     renew_caps(session);
2080     break;
2081 
2082   case CEPH_SESSION_RECALL_STATE:
     // MDS is under pressure; trim our cap count down to its limit
2083     trim_caps(session, m->get_max_caps());
2084     break;
2085 
2086   case CEPH_SESSION_FLUSHMSG:
2087     session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2088     break;
2089 
2090   case CEPH_SESSION_FORCE_RO:
2091     force_session_readonly(session);
2092     break;
2093 
2094   case CEPH_SESSION_REJECT:
     // remember the rejecting daemon instance so we don't keep retrying it
2095     rejected_by_mds[session->mds_num] = session->inst;
2096     _closed_mds_session(session);
2097 
2098     break;
2099 
2100   default:
2101     ceph_abort();
2102   }
2103 
2104   m->put();
2105 }
2106
2107bool Client::_any_stale_sessions() const
2108{
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118}
2119
2120void Client::_kick_stale_sessions()
2121{
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131}
2132
// (Re)build and transmit a MetaRequest to the given session's MDS.
// For replayed (already-unsafe) requests, mark the op as a replay and pin the
// target ino; for fresh sends, attach (or drop) accumulated cap releases.
// drop_cap_releases is set when we have not yet sent cap reconnect, in which
// case releases must not be piggy-backed.  Registers the request on the
// session and records which cap mseq it was sent under (for ESTALE handling).
2133void Client::send_request(MetaRequest *request, MetaSession *session,
2134			  bool drop_cap_releases)
2135{
2136  // make the request
2137  mds_rank_t mds = session->mds_num;
2138  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
2139		 << " for mds." << mds << dendl;
2140  MClientRequest *r = build_client_request(request);
2141  if (request->dentry()) {
2142    r->set_dentry_wanted();
2143  }
2144  if (request->got_unsafe) {
    // We already got an unsafe reply for this op: this is a replay, so tell
    // the MDS and identify the inode the earlier reply created/targeted.
2145    r->set_replayed_op();
2146    if (request->target)
2147      r->head.ino = request->target->ino;
2148  } else {
2149    encode_cap_releases(request, mds);
2150    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2151      request->cap_releases.clear();
2152    else
2153      r->releases.swap(request->cap_releases);
2154  }
2155  r->set_mdsmap_epoch(mdsmap->get_epoch());
  // setxattr may carry layout changes; the MDS needs our osdmap epoch to
  // validate pool references.
2156  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2157    objecter->with_osdmap([r](const OSDMap& o) {
2158	r->set_osdmap_epoch(o.get_epoch());
2159      });
2160  }
2161
  // Only stamp the first transmission; resends keep the original timestamp.
2162  if (request->mds == -1) {
2163    request->sent_stamp = ceph_clock_now();
2164    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
2165  }
2166  request->mds = mds;
2167
  // Remember the cap migration seq at send time so an ESTALE reply can tell
  // whether retrying the same MDS could possibly succeed.
2168  Inode *in = request->inode();
2169  if (in && in->caps.count(mds))
2170    request->sent_on_mseq = in->caps[mds]->mseq;
2171
2172  session->requests.push_back(&request->item);
2173
2174  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
2175  session->con->send_message(r);
2176}
2177
// Construct the wire-format MClientRequest for a MetaRequest, filling in the
// request head, filepaths (deriving them from the inode/dentry if the caller
// did not set one), payload data, retry/forward counters, and the caller's
// supplementary group list.  Note: increments request->retry_attempt as a
// side effect.  Caller owns the returned message.
2178MClientRequest* Client::build_client_request(MetaRequest *request)
2179{
2180  MClientRequest *req = new MClientRequest(request->get_op());
2181  req->set_tid(request->tid);
2182  req->set_stamp(request->op_stamp);
2183  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2184
2185  // if the filepath's haven't been set, set them!
2186  if (request->path.empty()) {
2187    Inode *in = request->inode();
2188    Dentry *de = request->dentry();
2189    if (in)
2190      in->make_nosnap_relative_path(request->path);
2191    else if (de) {
2192      if (de->inode)
2193	de->inode->make_nosnap_relative_path(request->path);
2194      else if (de->dir) {
        // Dentry without an inode (e.g. a create): path of the parent dir
        // plus the dentry name.
2195	de->dir->parent_inode->make_nosnap_relative_path(request->path);
2196	request->path.push_dentry(de->name);
2197      }
2198      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2199		   << " No path, inode, or appropriately-endowed dentry given!"
2200		   << dendl;
2201    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2202		   << " No path, inode, or dentry given!"
2203		   << dendl;
2204  }
2205  req->set_filepath(request->get_filepath());
2206  req->set_filepath2(request->get_filepath2());
2207  req->set_data(request->data);
  // retry_attempt is bumped here so each (re)send carries a distinct attempt.
2208  req->set_retry_attempt(request->retry_attempt++);
2209  req->head.num_fwd = request->num_fwd;
2210  const gid_t *_gids;
2211  int gid_count = request->perms.get_gids(&_gids);
2212  req->set_gid_list(gid_count, _gids);
2213  return req;
2214}
2215
2216
2217
2218void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219{
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255}
2256
2257bool Client::is_dir_operation(MetaRequest *req)
2258{
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266}
2267
// Handle an MDS reply for an in-flight MetaRequest.  Replies may arrive in
// two stages: an "unsafe" reply (applied but not yet durable) followed later
// by a "safe" one.  This routine attaches the reply trace, links unsafe ops
// onto the session/inode lists, wakes the caller exactly once (on the first
// reply), and tears the request down once the safe reply lands.
// Called with client_lock held; consumes the message (ownership passes to
// request->reply on the success path).
2268void Client::handle_client_reply(MClientReply *reply)
2269{
2270  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2271  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2272  if (!session) {
2273    reply->put();
2274    return;
2275  }
2276
2277  ceph_tid_t tid = reply->get_tid();
2278  bool is_safe = reply->is_safe();
2279
2280  if (mds_requests.count(tid) == 0) {
2281    lderr(cct) << "handle_client_reply no pending request on tid " << tid
2282	       << " safe is:" << is_safe << dendl;
2283    reply->put();
2284    return;
2285  }
2286  MetaRequest *request = mds_requests.at(tid);
2287
2288  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
2289		 << " tid " << tid << dendl;
2290
  // A second unsafe reply for the same tid is a duplicate; drop it.
2291  if (request->got_unsafe && !is_safe) {
2292    //duplicate response
2293    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2294		  << mds_num << " safe:" << is_safe << dendl;
2295    reply->put();
2296    return;
2297  }
2298
2299  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2300    ldout(cct, 20) << "got ESTALE on tid " << request->tid
2301		   << " from mds." << request->mds << dendl;
2302    request->send_to_auth = true;
2303    request->resend_mds = choose_target_mds(request);
2304    Inode *in = request->inode();
    // Only surface ESTALE to the caller if retrying cannot help: we would
    // retarget the very same MDS and the cap migration seq has not moved
    // since we sent.  Otherwise wake the caller to resend elsewhere.
2305    if (request->resend_mds >= 0 &&
2306	request->resend_mds == request->mds &&
2307	(in == NULL ||
2308	 in->caps.count(request->resend_mds) == 0 ||
2309	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
2310      // have to return ESTALE
2311    } else {
2312      request->caller_cond->Signal();
2313      reply->put();
2314      return;
2315    }
2316    ldout(cct, 20) << "have to return ESTALE" << dendl;
2317  }
2318
2319  assert(request->reply == NULL);
2320  request->reply = reply;
2321  insert_trace(request, session);
2322
2323  // Handle unsafe reply
2324  if (!is_safe) {
    // Track the op on the session and on the affected inode(s) so a later
    // fsync/unmount can wait for it to become safe.
2325    request->got_unsafe = true;
2326    session->unsafe_requests.push_back(&request->unsafe_item);
2327    if (is_dir_operation(request)) {
2328      Inode *dir = request->inode();
2329      assert(dir);
2330      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2331    }
2332    if (request->target) {
2333      InodeRef &in = request->target;
2334      in->unsafe_ops.push_back(&request->unsafe_target_item);
2335    }
2336  }
2337
2338  // Only signal the caller once (on the first reply):
2339  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2340  if (!is_safe || !request->got_unsafe) {
2341    Cond cond;
2342    request->dispatch_cond = &cond;
2343
2344    // wake up waiter
2345    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
2346    request->caller_cond->Signal();
2347
    // Wait for the caller thread to consume the reply and clear
    // dispatch_cond ("kick back") before we continue dispatching.
2348    // wake for kick back
2349    while (request->dispatch_cond) {
2350      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
2351      cond.Wait(client_lock);
2352    }
2353  }
2354
2355  if (is_safe) {
2356    // the filesystem change is committed to disk
2357    // we're done, clean up
2358    if (request->got_unsafe) {
2359      request->unsafe_item.remove_myself();
2360      request->unsafe_dir_item.remove_myself();
2361      request->unsafe_target_item.remove_myself();
2362      signal_cond_list(request->waitfor_safe);
2363    }
2364    request->item.remove_myself();
2365    unregister_request(request);
2366  }
2367  if (unmounting)
2368    mount_cond.Signal();
2369}
2370
// React to an OSD "full" flag for the given pool (pool == -1 means the
// cluster-wide full flag).  Cancels outstanding writes with -ENOSPC, purges
// un-flushed cached data for affected inodes (marking them with the error),
// and raises a cap epoch barrier so the MDS knows which epoch we acted at.
2371void Client::_handle_full_flag(int64_t pool)
2372{
2373  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2374    << "on " << pool << dendl;
2375  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2376  // to do this rather than blocking, because otherwise when we fill up we
2377  // potentially lock caps forever on files with dirty pages, and we need
2378  // to be able to release those caps to the MDS so that it can delete files
2379  // and free up space.
2380  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2381
2382  // For all inodes with layouts in this pool and a pending flush write op
2383  // (i.e. one of the ones we will cancel), we've got to purge_set their data
2384  // from ObjectCacher so that it doesn't re-issue the write in response to
2385  // the ENOSPC error.
2386  // Fortunately since we're cancelling everything in a given pool, we don't
2387  // need to know which ops belong to which ObjectSet, we can just blow all
2388  // the un-flushed cached data away and mark any dirty inodes' async_err
2389  // field with -ENOSPC as long as we're sure all the ops we cancelled were
2390  // affecting this pool, and all the objectsets we're purging were also
2391  // in this pool.
2392  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2393       i != inode_map.end(); ++i)
2394  {
2395    Inode *inode = i->second;
2396    if (inode->oset.dirty_or_tx
2397        && (pool == -1 || inode->layout.pool_id == pool)) {
2398      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2399        << " has dirty objects, purging and setting ENOSPC" << dendl;
2400      objectcacher->purge_set(&inode->oset);
2401      inode->set_async_err(-ENOSPC);
2402    }
2403  }
2404
  // op_cancel_writes returns (epoch_t)-1 when nothing was cancelled; only
  // raise the barrier if we actually cancelled something.
2405  if (cancelled_epoch != (epoch_t)-1) {
2406    set_cap_epoch_barrier(cancelled_epoch);
2407  }
2408}
2409
// Process a new OSD map: detect (un)blacklisting of this client and react by
// aborting in-flight MDS requests and force-closing sessions, then handle
// cluster-wide and per-pool FULL flags.  Called with client_lock held;
// consumes the message.
2410void Client::handle_osd_map(MOSDMap *m)
2411{
31f18b77
FG
2412  std::set<entity_addr_t> new_blacklists;
2413  objecter->consume_blacklist_events(&new_blacklists);
2414
2415  const auto myaddr = messenger->get_myaddr();
2416  if (!blacklisted && new_blacklists.count(myaddr)) {
2417    auto epoch = objecter->with_osdmap([](const OSDMap &o){
2418      return o.get_epoch();
2419    });
2420    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2421    blacklisted = true;
    // Abort every pending MDS request; waiting callers are woken and will
    // observe the -EBLACKLISTED abort.
2422    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2423	 p != mds_requests.end(); ) {
2424      auto req = p->second;
2425      ++p;
2426      req->abort(-EBLACKLISTED);
2427      if (req->caller_cond) {
2428	req->kick = true;
2429	req->caller_cond->Signal();
2430      }
2431    }
2432
2433    // Progress aborts on any requests that were on this waitlist.  Any
2434    // requests that were on a waiting_for_open session waitlist
2435    // will get kicked during close session below.
2436    signal_cond_list(waiting_for_mdsmap);
2437
2438    // Force-close all sessions: assume this is not abandoning any state
2439    // on the MDS side because the MDS will have seen the blacklist too.
2440    while(!mds_sessions.empty()) {
2441      auto i = mds_sessions.begin();
2442      auto session = i->second;
2443      _closed_mds_session(session);
2444    }
2445
2446    // Since we know all our OSD ops will fail, cancel them all preemtively,
2447    // so that on an unhealthy cluster we can umount promptly even if e.g.
2448    // some PGs were inaccessible.
2449    objecter->op_cancel_writes(-EBLACKLISTED);
2450
2451  } else if (blacklisted) {
2452    // Handle case where we were blacklisted but no longer are
2453    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
2454	return o.is_blacklisted(myaddr);});
2455  }
2456
7c673cae
FG
2457  if (objecter->osdmap_full_flag()) {
2458    _handle_full_flag(-1);
2459  } else {
2460    // Accumulate local list of full pools so that I can drop
2461    // the objecter lock before re-entering objecter in
2462    // cancel_writes
2463    std::vector<int64_t> full_pools;
2464
2465    objecter->with_osdmap([&full_pools](const OSDMap &o) {
2466	for (const auto& kv : o.get_pools()) {
2467	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2468	    full_pools.push_back(kv.first);
2469	  }
2470	}
2471      });
2472
2473    for (auto p : full_pools)
2474      _handle_full_flag(p);
2475
2476    // Subscribe to subsequent maps to watch for the full flag going
2477    // away.  For the global full flag objecter does this for us, but
2478    // it pays no attention to the per-pool full flag so in this branch
2479    // we do it ourselves.
2480    if (!full_pools.empty()) {
2481      objecter->maybe_request_map();
2482    }
2483  }
2484
2485  m->put();
2486}
2487
2488
2489// ------------------------
2490// incoming messages
2491
2492
2493bool Client::ms_dispatch(Message *m)
2494{
2495 Mutex::Locker l(client_lock);
2496 if (!initialized) {
2497 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2498 m->put();
2499 return true;
2500 }
2501
2502 switch (m->get_type()) {
2503 // mounting and mds sessions
2504 case CEPH_MSG_MDS_MAP:
2505 handle_mds_map(static_cast<MMDSMap*>(m));
2506 break;
2507 case CEPH_MSG_FS_MAP:
2508 handle_fs_map(static_cast<MFSMap*>(m));
2509 break;
2510 case CEPH_MSG_FS_MAP_USER:
2511 handle_fs_map_user(static_cast<MFSMapUser*>(m));
2512 break;
2513 case CEPH_MSG_CLIENT_SESSION:
2514 handle_client_session(static_cast<MClientSession*>(m));
2515 break;
2516
2517 case CEPH_MSG_OSD_MAP:
2518 handle_osd_map(static_cast<MOSDMap*>(m));
2519 break;
2520
2521 // requests
2522 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2523 handle_client_request_forward(static_cast<MClientRequestForward*>(m));
2524 break;
2525 case CEPH_MSG_CLIENT_REPLY:
2526 handle_client_reply(static_cast<MClientReply*>(m));
2527 break;
2528
2529 case CEPH_MSG_CLIENT_SNAP:
2530 handle_snap(static_cast<MClientSnap*>(m));
2531 break;
2532 case CEPH_MSG_CLIENT_CAPS:
2533 handle_caps(static_cast<MClientCaps*>(m));
2534 break;
2535 case CEPH_MSG_CLIENT_LEASE:
2536 handle_lease(static_cast<MClientLease*>(m));
2537 break;
2538 case MSG_COMMAND_REPLY:
2539 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2540 handle_command_reply(static_cast<MCommandReply*>(m));
2541 } else {
2542 return false;
2543 }
2544 break;
2545 case CEPH_MSG_CLIENT_QUOTA:
2546 handle_quota(static_cast<MClientQuota*>(m));
2547 break;
2548
2549 default:
2550 return false;
2551 }
2552
2553 // unmounting?
2554 if (unmounting) {
2555 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2556 << "+" << inode_map.size() << dendl;
2557 long unsigned size = lru.lru_get_size() + inode_map.size();
2558 trim_cache();
2559 if (size < lru.lru_get_size() + inode_map.size()) {
2560 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2561 mount_cond.Signal();
2562 } else {
2563 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2564 << "+" << inode_map.size() << dendl;
2565 }
2566 }
2567
2568 return true;
2569}
2570
2571void Client::handle_fs_map(MFSMap *m)
2572{
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579}
2580
2581void Client::handle_fs_map_user(MFSMapUser *m)
2582{
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589}
2590
// Process a new MDS map: drop stale epochs, cancel admin commands aimed at
// GIDs that vanished or went laggy, and walk every open session comparing
// old vs. new MDS state — marking connections down, sending reconnects, and
// kicking requests/caps as ranks come up, move, or disappear.
// Called with client_lock held; consumes the message.
2591void Client::handle_mds_map(MMDSMap* m)
2592{
2593  if (m->get_epoch() <= mdsmap->get_epoch()) {
2594    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2595                  << " is identical to or older than our "
2596                  << mdsmap->get_epoch() << dendl;
2597    m->put();
2598    return;
2599  }
2600
2601  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2602
  // Keep the previous map so per-rank state transitions can be compared.
2603  std::unique_ptr<MDSMap> oldmap(new MDSMap);
2604  oldmap.swap(mdsmap);
2605
2606  mdsmap->decode(m->get_encoded());
2607
2608  // Cancel any commands for missing or laggy GIDs
2609  std::list<ceph_tid_t> cancel_ops;
2610  auto &commands = command_table.get_commands();
2611  for (const auto &i : commands) {
2612    auto &op = i.second;
2613    const mds_gid_t op_mds_gid = op.mds_gid;
2614    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2615      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2616      cancel_ops.push_back(i.first);
2617      if (op.outs) {
2618        std::ostringstream ss;
2619        ss << "MDS " << op_mds_gid << " went away";
2620        *(op.outs) = ss.str();
2621      }
2622      op.con->mark_down();
2623      if (op.on_finish) {
2624        op.on_finish->complete(-ETIMEDOUT);
2625      }
2626    }
2627  }
2628
  // Erase cancelled entries after the scan; erasing during the range-for
  // above would invalidate the iteration.
2629  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2630       i != cancel_ops.end(); ++i) {
2631    command_table.erase(*i);
2632  }
2633
2634  // reset session
2635  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2636       p != mds_sessions.end(); ) {
2637    mds_rank_t mds = p->first;
2638    MetaSession *session = p->second;
    // Advance before acting: _closed_mds_session below erases the entry.
2639    ++p;
2640
2641    int oldstate = oldmap->get_state(mds);
2642    int newstate = mdsmap->get_state(mds);
2643    if (!mdsmap->is_up(mds)) {
2644      session->con->mark_down();
2645    } else if (mdsmap->get_inst(mds) != session->inst) {
      // Rank failed over to a different daemon instance.
2646      session->con->mark_down();
2647      session->inst = mdsmap->get_inst(mds);
2648      // When new MDS starts to take over, notify kernel to trim unused entries
2649      // in its dcache/icache. Hopefully, the kernel will release some unused
2650      // inodes before the new MDS enters reconnect state.
2651      trim_cache_for_reconnect(session);
2652    } else if (oldstate == newstate)
2653      continue;  // no change
2654
2655    session->mds_state = newstate;
2656    if (newstate == MDSMap::STATE_RECONNECT) {
2657      session->con = messenger->get_connection(session->inst);
2658      send_reconnect(session);
2659    } else if (newstate >= MDSMap::STATE_ACTIVE) {
2660      if (oldstate < MDSMap::STATE_ACTIVE) {
2661	// kick new requests
2662	kick_requests(session);
2663	kick_flushing_caps(session);
2664	signal_context_list(session->waiting_for_open);
2665	kick_maxsize_requests(session);
2666	wake_inode_waiters(session);
2667      }
2668      connect_mds_targets(mds);
2669    } else if (newstate == MDSMap::STATE_NULL &&
2670	       mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists in the cluster; drop the session entirely.
2671      _closed_mds_session(session);
2672    }
2673  }
2674
2675  // kick any waiting threads
2676  signal_cond_list(waiting_for_mdsmap);
2677
2678  m->put();
2679
2680  monclient->sub_got("mdsmap", mdsmap->get_epoch());
2681}
2682
// Send an MClientReconnect to an MDS that has entered reconnect state after
// a restart/failover.  Re-advertises every cap we hold from that rank
// (resetting cap sequence numbers), the paths and file locks that go with
// them, and the snap realms involved; also resends unsafe requests so the
// MDS can replay them.
2683void Client::send_reconnect(MetaSession *session)
2684{
2685  mds_rank_t mds = session->mds_num;
2686  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;
2687
2688  // trim unused caps to reduce MDS's cache rejoin time
2689  trim_cache_for_reconnect(session);
2690
2691  session->readonly = false;
2692
  // Drop any batched cap-release message; those releases are moot now.
2693  if (session->release) {
2694    session->release->put();
2695    session->release = NULL;
2696  }
2697
2698  // reset my cap seq number
2699  session->seq = 0;
2700  //connect to the mds' offload targets
2701  connect_mds_targets(mds);
2702  //make sure unsafe requests get saved
2703  resend_unsafe_requests(session);
2704
2705  MClientReconnect *m = new MClientReconnect;
2706
2707  // i have an open session.
  // Walk every cached inode and re-advertise the caps issued by this rank.
2708  ceph::unordered_set<inodeno_t> did_snaprealm;
2709  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2710       p != inode_map.end();
2711       ++p) {
2712    Inode *in = p->second;
2713    if (in->caps.count(mds)) {
2714      ldout(cct, 10) << " caps on " << p->first
2715	       << " " << ccap_string(in->caps[mds]->issued)
2716	       << " wants " << ccap_string(in->caps_wanted())
2717	       << dendl;
2718      filepath path;
2719      in->make_long_path(path);
2720      ldout(cct, 10) << "    path " << path << dendl;
2721
2722      bufferlist flockbl;
2723      _encode_filelocks(in, flockbl);
2724
      // Reset per-cap sequence numbers; the restarted MDS starts from zero.
      // Claim everything we had implemented as issued.
2725      Cap *cap = in->caps[mds];
2726      cap->seq = 0;  // reset seq.
2727      cap->issue_seq = 0;  // reset seq.
2728      cap->mseq = 0;  // reset seq.
2729      cap->issued = cap->implemented;
2730
2731      snapid_t snap_follows = 0;
2732      if (!in->cap_snaps.empty())
2733	snap_follows = in->cap_snaps.begin()->first;
2734
2735      m->add_cap(p->first.ino, 
2736		 cap->cap_id,
2737		 path.get_ino(), path.get_path(),   // ino
2738		 in->caps_wanted(), // wanted
2739		 cap->issued,     // issued
2740		 in->snaprealm->ino,
2741		 snap_follows,
2742		 flockbl);
2743
      // Describe each snap realm at most once per reconnect message.
2744      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2745	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2746	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2747	did_snaprealm.insert(in->snaprealm->ino);
2748      }
2749    }
2750  }
2751
2752  early_kick_flushing_caps(session);
2753
2754  session->con->send_message(m);
2755
2756  mount_cond.Signal();
2757}
2758
2759
// After a session's MDS becomes active, resend requests that target it.
// Skips requests that already got an unsafe reply (those are replayed via
// resend_unsafe_requests), wakes aborted requests' callers, and only
// resends requests that have never been attempted (retry_attempt == 0).
2760void Client::kick_requests(MetaSession *session)
2761{
2762  ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764       p != mds_requests.end();
2765       ++p) {
31f18b77
FG
2766    MetaRequest *req = p->second;
2767    if (req->got_unsafe)
2768      continue;
2769    if (req->aborted()) {
      // Wake the caller so it can observe the abort; nothing to resend.
2770      if (req->caller_cond) {
2771	req->kick = true;
2772	req->caller_cond->Signal();
2773      }
7c673cae 2774      continue;
31f18b77
FG
2775    }
2776    if (req->retry_attempt > 0)
7c673cae 2777      continue; // new requests only
31f18b77 2778    if (req->mds == session->mds_num) {
7c673cae
FG
2779      send_request(p->second, session);
2780    }
2781  }
2782}
2783
// During reconnect, re-send every request the MDS must replay: first all
// requests that already received an unsafe reply, then previously-sent
// ("old", retry_attempt > 0) requests for this rank so the MDS can process
// completed ops in its clientreplay stage.  Cap releases are dropped on the
// second pass (send_request's drop_cap_releases=true) since cap reconnect
// has not been sent yet.
2784void Client::resend_unsafe_requests(MetaSession *session)
2785{
2786  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2787       !iter.end();
2788       ++iter)
2789    send_request(*iter, session);
2790
2791  // also re-send old requests when MDS enters reconnect stage. So that MDS can
2792  // process completed requests in clientreplay stage.
2793  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2794       p != mds_requests.end();
2795       ++p) {
2796    MetaRequest *req = p->second;
2797    if (req->got_unsafe)
2798      continue;
31f18b77
FG
2799    if (req->aborted())
2800      continue;
7c673cae
FG
2801    if (req->retry_attempt == 0)
2802      continue; // old requests only
2803    if (req->mds == session->mds_num)
2804      send_request(req, session, true);
2805  }
2806}
2807
// Block until all currently-unsafe requests have become safe.  Because each
// session's unsafe list is ordered, waiting on the *last* unsafe request of
// every session suffices.  Each request is pinned (get/put) so it cannot be
// freed while we wait on it.
2808void Client::wait_unsafe_requests()
2809{
2810  list<MetaRequest*> last_unsafe_reqs;
2811  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812       p != mds_sessions.end();
2813       ++p) {
2814    MetaSession *s = p->second;
2815    if (!s->unsafe_requests.empty()) {
2816      MetaRequest *req = s->unsafe_requests.back();
2817      req->get();
2818      last_unsafe_reqs.push_back(req);
2819    }
2820  }
2821
2822  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823       p != last_unsafe_reqs.end();
2824       ++p) {
2825    MetaRequest *req = *p;
    // Only wait if the request is still on an unsafe list; it may already
    // have become safe between the two loops.
2826    if (req->unsafe_item.is_on_list())
2827      wait_on_list(req->waitfor_safe);
2828    put_request(req);
2829  }
2830}
2831
// Tear down every request bound to a session that is being closed: wake the
// caller (with kick set so it re-evaluates), detach the request from session
// lists, and unregister any request that had only reached the unsafe stage —
// its durability can no longer be confirmed.
2832void Client::kick_requests_closed(MetaSession *session)
2833{
2834  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
2835  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2836       p != mds_requests.end(); ) {
2837    MetaRequest *req = p->second;
    // Advance first: unregister_request below may erase the current entry.
2838    ++p;
2839    if (req->mds == session->mds_num) {
2840      if (req->caller_cond) {
2841	req->kick = true;
2842	req->caller_cond->Signal();
2843      }
2844      req->item.remove_myself();
2845      if (req->got_unsafe) {
2846	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
2847	req->unsafe_item.remove_myself();
2848	req->unsafe_dir_item.remove_myself();
2849	req->unsafe_target_item.remove_myself();
2850	signal_cond_list(req->waitfor_safe);
2851	unregister_request(req);
2852      }
2853    }
2854  }
  // After the sweep nothing may still reference this session.
2855  assert(session->requests.empty());
2856  assert(session->unsafe_requests.empty());
2857}
2858
2859
2860
2861
2862/************
2863 * leases
2864 */
2865
2866void Client::got_mds_push(MetaSession *s)
2867{
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873}
2874
// Handle a dentry-lease revocation from an MDS: invalidate our local lease
// on the named dentry (if we still have it cached) and always reply with a
// lease RELEASE.  Only CEPH_MDS_LEASE_REVOKE is expected here.
// Called with client_lock held; consumes the message.
2875void Client::handle_lease(MClientLease *m)
2876{
2877  ldout(cct, 10) << "handle_lease " << *m << dendl;
2878
2879  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2880
2881  mds_rank_t mds = mds_rank_t(m->get_source().num());
2882  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2883  if (!session) {
2884    m->put();
2885    return;
2886  }
2887
2888  got_mds_push(session);
2889
2890  ceph_seq_t seq = m->get_seq();
2891
2892  Inode *in;
2893  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  // If we no longer cache the inode or dentry there is nothing to
  // invalidate locally, but we still must acknowledge the revoke.
2894  if (inode_map.count(vino) == 0) {
2895    ldout(cct, 10) << " don't have vino " << vino << dendl;
2896    goto revoke;
2897  }
2898  in = inode_map[vino];
2899
2900  if (m->get_mask() & CEPH_LOCK_DN) {
2901    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2902      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
2903      goto revoke;
2904    }
2905    Dentry *dn = in->dir->dentries[m->dname];
2906    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // -1 marks the dentry as holding no lease from any MDS.
2907    dn->lease_mds = -1;
2908  }
2909
2910 revoke:
2911  m->get_connection()->send_message(
2912    new MClientLease(
2913      CEPH_MDS_LEASE_RELEASE, seq,
2914      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
2915  m->put();
2916}
2917
// Drop n references on an inode; when the count reaches zero, release its
// caps and cached objects, remove it from the inode map (and faked-ino
// table), clear root bookkeeping if this was the root, and free it.
2918void Client::put_inode(Inode *in, int n)
2919{
2920  ldout(cct, 10) << "put_inode on " << *in << dendl;
2921  int left = in->_put(n);
2922  if (left == 0) {
2923    // release any caps
2924    remove_all_caps(in);
2925
2926    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    // By now all dirty data must have been flushed; release_set returning
    // "unclean" here would indicate lost writes.
2927    bool unclean = objectcacher->release_set(&in->oset);
2928    assert(!unclean);
2929    inode_map.erase(in->vino());
2930    if (use_faked_inos())
2931      _release_faked_ino(in);
2932
2933    if (in == root) {
2934      root = 0;
2935      root_ancestor = 0;
2936      while (!root_parents.empty())
2937        root_parents.erase(root_parents.begin());
2938    }
2939
2940    delete in;
2941  }
2942}
2943
// Free an empty Dir object: unpin the parent's dentry (if any), detach and
// delete the Dir, and drop the inode pin the Dir held on its parent.
// Precondition: dir is empty and is the parent inode's current dir.
2944void Client::close_dir(Dir *dir)
2945{
2946  Inode *in = dir->parent_inode;
2947  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2948  assert(dir->is_empty());
2949  assert(in->dir == dir);
2950  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2951  if (!in->dn_set.empty())
2952    in->get_first_parent()->put(); // unpin dentry
2953
2954  delete in->dir;
2955  in->dir = 0;
2956  put_inode(in);               // unpin inode
2957}
2958
2959 /**
2960 * Don't call this with in==NULL, use get_or_create for that
2961 * leave dn set to default NULL unless you're trying to add
2962 * a new inode to a pre-created Dentry
2963 */
// Link an inode into a directory under the given name.  If dn is NULL a new
// Dentry is created and inserted into dir and the LRU; otherwise the
// existing dentry is (re)pointed at the inode.  Directories may have only
// one parent link, so an existing parent dentry is unlinked first (keeping
// its dir and dentry objects alive).  Returns the dentry used.
2964Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
2965{
2966  if (!dn) {
2967    // create a new Dentry
2968    dn = new Dentry;
2969    dn->name = name;
2970
2971    // link to dir
2972    dn->dir = dir;
2973    dir->dentries[dn->name] = dn;
2974    lru.lru_insert_mid(dn);    // mid or top?
2975
2976    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2977		   << " dn " << dn << " (new dn)" << dendl;
2978  } else {
2979    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2980		   << " dn " << dn << " (old dn)" << dendl;
2981  }
2982
2983  if (in) {    // link to inode
2984    dn->inode = in;
    // Mirror the pins that unlink() later releases: a directory inode pins
    // its dentry once for an open Dir and once for a low-level (ll) ref.
2985    if (in->is_dir()) {
2986      if (in->dir)
2987	dn->get(); // dir -> dn pin
2988      if (in->ll_ref)
2989	dn->get(); // ll_ref -> dn pin
2990    }
2991
2992    assert(in->dn_set.count(dn) == 0);
2993
2994    // only one parent for directories!
2995    if (in->is_dir() && !in->dn_set.empty()) {
2996      Dentry *olddn = in->get_first_parent();
2997      assert(olddn->dir != dir || olddn->name != name);
2998      Inode *old_diri = olddn->dir->parent_inode;
      // The old parent directory's contents changed; its cached listing is
      // no longer complete/ordered.
2999      old_diri->dir_release_count++;
3000      clear_dir_complete_and_ordered(old_diri, true);
3001      unlink(olddn, true, true);  // keep dir, dentry
3002    }
3003
3004    in->dn_set.insert(dn);
3005
3006    ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
3007  }
3008
3009  return dn;
3010}
3011
3012void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013{
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048}
3049
3050/**
3051 * For asynchronous flushes, check for errors from the IO and
3052 * update the inode if necessary
3053 */
3054class C_Client_FlushComplete : public Context {
3055private:
3056  Client *client;
  // InodeRef keeps the inode alive for the duration of the async flush.
3057  InodeRef inode;
3058public:
3059  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Completion callback: on flush error, record the error on the inode so a
  // later fsync/close can report it.  Expects client_lock to be held.
3060  void finish(int r) override {
3061    assert(client->client_lock.is_locked_by_me());
3062    if (r != 0) {
3063      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
3064      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3065			    << " 0x" << std::hex << inode->ino << std::dec
3066			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3067      inode->set_async_err(r);
3068    }
3069  }
3070};
3071
3072
3073/****
3074 * caps
3075 */
3076
// Take local references on capability bits for an inode.  The first
// FILE_BUFFER or FILE_CACHE reference additionally pins the inode itself
// (in->get()) so it cannot be freed while buffered/cached data depends on
// it; put_cap_ref() releases the matching pins.
3077void Client::get_cap_ref(Inode *in, int cap)
3078{
3079  if ((cap & CEPH_CAP_FILE_BUFFER) &&
3080      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3081    ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3082    in->get();
3083  }
3084  if ((cap & CEPH_CAP_FILE_CACHE) &&
3085      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3086    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3087    in->get();
3088  }
3089  in->get_cap_ref(cap);
3090}
3091
// Release capability references taken by get_cap_ref().  `last` holds the
// cap bits whose local refcount just hit zero; for those we may finish a
// pending cap_snap (last writer), clear snap dirty-data flags and wake
// commit waiters (last buffer), check caps back to the MDS for bits no
// longer issued, and drop the inode pins counted in put_nref.
3092void Client::put_cap_ref(Inode *in, int cap)
3093{
3094  int last = in->put_cap_ref(cap);
3095  if (last) {
3096    int put_nref = 0;
    // Bits we just stopped using that the MDS hasn't issued to us anymore.
3097    int drop = last & ~in->caps_issued();
3098    if (in->snapid == CEPH_NOSNAP) {
      // Dropping the last write ref with a cap_snap still marked "writing"
      // means we were the writer it was waiting for; finish it now.
3099      if ((last & CEPH_CAP_FILE_WR) &&
3100	  !in->cap_snaps.empty() &&
3101	  in->cap_snaps.rbegin()->second.writing) {
3102	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
3103	in->cap_snaps.rbegin()->second.writing = 0;
3104	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3105	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
3106      }
3107      if (last & CEPH_CAP_FILE_BUFFER) {
3108	for (auto &p : in->cap_snaps)
3109	  p.second.dirty_data = 0;
3110	signal_cond_list(in->waitfor_commit);
3111	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
3112	++put_nref;
3113      }
3114    }
3115    if (last & CEPH_CAP_FILE_CACHE) {
3116      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
3117      ++put_nref;
3118    }
3119    if (drop)
3120      check_caps(in, 0);
    // Release the inode pins taken on first FILE_BUFFER/FILE_CACHE ref.
3121    if (put_nref)
3122      put_inode(in, put_nref);
3123  }
3124}
3125
// Acquire the caps in 'need' (blocking as necessary), plus whatever of
// 'want' happens to already be issued.  For writes, also waits until
// 'endoff' is covered by max_size, asking the MDS for more headroom first.
// On success *phave = need | (have & want) and a cap reference is taken.
// Returns 0 on success, -EBADF if no open handle wants these caps, -EROFS
// for a write on a read-only session, or a pool-perm / cap-renew error.
3126 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3127 {
3128 int r = check_pool_perm(in, need);
3129 if (r < 0)
3130 return r;
3131 
3132 while (1) {
3133 int file_wanted = in->caps_file_wanted();
3134 if ((file_wanted & need) != need) {
3135 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3136 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3137 << dendl;
3138 return -EBADF;
3139 }
3140 
3141 int implemented;
3142 int have = in->caps_issued(&implemented);
3143 
3144 bool waitfor_caps = false;
3145 bool waitfor_commit = false;
3146 
3147 if (have & need & CEPH_CAP_FILE_WR) {
// Request a larger max_size from the MDS before blocking on it.
3148 if (endoff > 0 &&
3149 (endoff >= (loff_t)in->max_size ||
3150 endoff > (loff_t)(in->size << 1)) &&
3151 endoff > (loff_t)in->wanted_max_size) {
3152 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3153 in->wanted_max_size = endoff;
3154 check_caps(in, 0);
3155 }
3156 
3157 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3158 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3159 waitfor_caps = true;
3160 }
// New writes must not overlap a snapshot that is still being written back.
3161 if (!in->cap_snaps.empty()) {
3162 if (in->cap_snaps.rbegin()->second.writing) {
3163 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3164 waitfor_caps = true;
3165 }
3166 for (auto &p : in->cap_snaps) {
3167 if (p.second.dirty_data) {
3168 waitfor_commit = true;
3169 break;
3170 }
3171 }
3172 if (waitfor_commit) {
3173 _flush(in, new C_Client_FlushComplete(this, in));
3174 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3175 }
3176 }
3177 }
3178 
3179 if (!waitfor_caps && !waitfor_commit) {
3180 if ((have & need) == need) {
// Only grab the caps if none of the merely-wanted ones are mid-revocation.
3181 int butnot = want & ~(have & need);
3182 int revoking = implemented & ~have;
3183 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3184 << " need " << ccap_string(need) << " want " << ccap_string(want)
3185 << " but not " << ccap_string(butnot) << " revoking " << ccap_string(revoking)
3186 << dendl;
3187 if ((revoking & butnot) == 0) {
3188 *phave = need | (have & want);
3189 in->get_cap_ref(need);
3190 return 0;
3191 }
3192 }
3193 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3194 waitfor_caps = true;
3195 }
3196 
3197 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3198 in->auth_cap->session->readonly)
3199 return -EROFS;
3200 
// Caps were dropped (e.g. session reset): re-request them from the MDS
// before waiting, otherwise we could block forever.
3201 if (in->flags & I_CAP_DROPPED) {
3202 int mds_wanted = in->caps_mds_wanted();
3203 if ((mds_wanted & need) != need) {
3204 int ret = _renew_caps(in);
3205 if (ret < 0)
3206 return ret;
3207 continue;
3208 }
3209 if ((mds_wanted & file_wanted) ==
3210 (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
3211 in->flags &= ~I_CAP_DROPPED;
3212 }
3213 }
3214 
3215 if (waitfor_caps)
3216 wait_on_list(in->waitfor_caps);
3217 else if (waitfor_commit)
3218 wait_on_list(in->waitfor_commit);
3219 }
3220 }
3221
3222int Client::get_caps_used(Inode *in)
3223{
3224 unsigned used = in->caps_used();
3225 if (!(used & CEPH_CAP_FILE_CACHE) &&
3226 !objectcacher->set_is_empty(&in->oset))
3227 used |= CEPH_CAP_FILE_CACHE;
3228 return used;
3229}
3230
3231void Client::cap_delay_requeue(Inode *in)
3232{
3233 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3234 in->hold_caps_until = ceph_clock_now();
3235 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3236 delayed_caps.push_back(&in->cap_item);
3237}
3238
// Build and send a CEPH_CAP_OP_UPDATE message for one cap: acknowledge
// revocations (shrinking issued/implemented to 'retain'/'used'), report
// current inode metadata, and start the flush identified by 'flush_tid'
// when 'flush' is non-zero.  The client_inject_release_failure config
// deliberately simulates a buggy client that fails to release caps.
3239 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3240 bool sync, int used, int want, int retain,
3241 int flush, ceph_tid_t flush_tid)
3242 {
3243 int held = cap->issued | cap->implemented;
3244 int revoking = cap->implemented & ~cap->issued;
3245 retain &= ~revoking;
3246 int dropping = cap->issued & ~retain;
3247 int op = CEPH_CAP_OP_UPDATE;
3248 
3249 ldout(cct, 10) << "send_cap " << *in
3250 << " mds." << session->mds_num << " seq " << cap->seq
3251 << (sync ? " sync " : " async ")
3252 << " used " << ccap_string(used)
3253 << " want " << ccap_string(want)
3254 << " flush " << ccap_string(flush)
3255 << " retain " << ccap_string(retain)
3256 << " held "<< ccap_string(held)
3257 << " revoking " << ccap_string(revoking)
3258 << " dropping " << ccap_string(dropping)
3259 << dendl;
3260 
3261 if (cct->_conf->client_inject_release_failure && revoking) {
3262 const int would_have_issued = cap->issued & retain;
3263 const int would_have_implemented = cap->implemented & (cap->issued | used);
3264 // Simulated bug:
3265 // - tell the server we think issued is whatever they issued plus whatever we implemented
3266 // - leave what we have implemented in place
3267 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3268 cap->issued = cap->issued | cap->implemented;
3269 
3270 // Make an exception for revoking xattr caps: we are injecting
3271 // failure to release other caps, but allow xattr because client
3272 // will block on xattr ops if it can't release these to MDS (#9800)
3273 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3274 cap->issued ^= xattr_mask & revoking;
3275 cap->implemented ^= xattr_mask & revoking;
3276 
3277 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3278 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3279 } else {
3280 // Normal behaviour
3281 cap->issued &= retain;
3282 cap->implemented &= cap->issued | used;
3283 }
3284 
3285 snapid_t follows = 0;
3286 
// Flushes must name the snap context they follow so the MDS orders them
// correctly with respect to snapshots.
3287 if (flush)
3288 follows = in->snaprealm->get_snap_context().seq;
3289 
3290 MClientCaps *m = new MClientCaps(op,
3291 in->ino,
3292 0,
3293 cap->cap_id, cap->seq,
3294 cap->implemented,
3295 want,
3296 flush,
3297 cap->mseq,
3298 cap_epoch_barrier);
3299 m->caller_uid = in->cap_dirtier_uid;
3300 m->caller_gid = in->cap_dirtier_gid;
3301 
3302 m->head.issue_seq = cap->issue_seq;
3303 m->set_tid(flush_tid);
3304 
// Mirror the inode's current metadata into the message.
3305 m->head.uid = in->uid;
3306 m->head.gid = in->gid;
3307 m->head.mode = in->mode;
3308 
3309 m->head.nlink = in->nlink;
3310 
3311 if (flush & CEPH_CAP_XATTR_EXCL) {
3312 ::encode(in->xattrs, m->xattrbl);
3313 m->head.xattr_version = in->xattr_version;
3314 }
3315 
3316 m->size = in->size;
3317 m->max_size = in->max_size;
3318 m->truncate_seq = in->truncate_seq;
3319 m->truncate_size = in->truncate_size;
3320 m->mtime = in->mtime;
3321 m->atime = in->atime;
3322 m->ctime = in->ctime;
3323 m->btime = in->btime;
3324 m->time_warp_seq = in->time_warp_seq;
3325 m->change_attr = in->change_attr;
3326 if (sync)
3327 m->flags |= CLIENT_CAPS_SYNC;
3328 
3329 if (flush & CEPH_CAP_FILE_WR) {
3330 m->inline_version = in->inline_version;
3331 m->inline_data = in->inline_data;
3332 }
3333 
3334 in->reported_size = in->size;
3335 m->set_snap_follows(follows);
3336 cap->wanted = want;
// Only the auth cap carries the max_size request.
3337 if (cap == in->auth_cap) {
3338 m->set_max_size(in->wanted_max_size);
3339 in->requested_max_size = in->wanted_max_size;
3340 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3341 }
3342 
3343 if (!session->flushing_caps_tids.empty())
3344 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3345 
3346 session->con->send_message(m);
3347 }
3348
31f18b77
FG
3349static bool is_max_size_approaching(Inode *in)
3350{
3351 /* mds will adjust max size according to the reported size */
3352 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3353 return false;
3354 if (in->size >= in->max_size)
3355 return true;
3356 /* half of previous max_size increment has been used */
3357 if (in->max_size > in->reported_size &&
3358 (in->size << 1) >= in->max_size + in->reported_size)
3359 return true;
3360 return false;
3361}
7c673cae
FG
3362
3363/**
3364 * check_caps
3365 *
3366 * Examine currently used and wanted versus held caps. Release, flush or ack
3367 * revoked caps to the MDS as appropriate.
3368 *
3369 * @param in the inode to check
3370 * @param flags flags to apply to cap check
3371 */
3372 void Client::check_caps(Inode *in, unsigned flags)
3373 {
3374 unsigned wanted = in->caps_wanted();
3375 unsigned used = get_caps_used(in);
3376 unsigned cap_used;
3377 
3378 if (in->is_dir() && (in->flags & I_COMPLETE)) {
3379 // we do this here because we don't want to drop to Fs (and then
3380 // drop the Fs if we do a create!) if that alone makes us send lookups
3381 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3382 wanted |= CEPH_CAP_FILE_EXCL;
3383 }
3384 
3385 int implemented;
3386 int issued = in->caps_issued(&implemented);
3387 int revoking = implemented & ~issued;
3388 
// 'retain' is what we would like to keep; everything else may be released.
3389 int retain = wanted | used | CEPH_CAP_PIN;
3390 if (!unmounting) {
3391 if (wanted)
3392 retain |= CEPH_CAP_ANY;
3393 else
3394 retain |= CEPH_CAP_ANY_SHARED;
3395 }
3396 
3397 ldout(cct, 10) << "check_caps on " << *in
3398 << " wanted " << ccap_string(wanted)
3399 << " used " << ccap_string(used)
3400 << " issued " << ccap_string(issued)
3401 << " revoking " << ccap_string(revoking)
3402 << " flags=" << flags
3403 << dendl;
3404 
3405 if (in->snapid != CEPH_NOSNAP)
3406 return; //snap caps last forever, can't write
3407 
3408 if (in->caps.empty())
3409 return; // guard if at end of func
3410 
// If CACHE is being revoked and we hold clean cached data, drop it now so
// the revocation can complete.
3411 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
3412 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
3413 _release(in);
3414 
3415 if (!in->cap_snaps.empty())
3416 flush_snaps(in);
3417 
3418 if (flags & CHECK_CAPS_NODELAY)
3419 in->hold_caps_until = utime_t();
3420 else
3421 cap_delay_requeue(in);
3422 
3423 utime_t now = ceph_clock_now();
3424 
// Walk every cap (one per MDS); each either falls through to 'ack' (send
// a cap message) or 'continue's (nothing to tell that MDS).
3425 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3426 while (it != in->caps.end()) {
3427 mds_rank_t mds = it->first;
3428 Cap *cap = it->second;
3429 ++it;
3430 
3431 MetaSession *session = mds_sessions[mds];
3432 assert(session);
3433 
// Usage covered by the auth cap does not count against non-auth caps.
3434 cap_used = used;
3435 if (in->auth_cap && cap != in->auth_cap)
3436 cap_used &= ~in->auth_cap->issued;
3437 
3438 revoking = cap->implemented & ~cap->issued;
3439 
3440 ldout(cct, 10) << " cap mds." << mds
3441 << " issued " << ccap_string(cap->issued)
3442 << " implemented " << ccap_string(cap->implemented)
3443 << " revoking " << ccap_string(revoking) << dendl;
3444 
3445 if (in->wanted_max_size > in->max_size &&
3446 in->wanted_max_size > in->requested_max_size &&
3447 cap == in->auth_cap)
3448 goto ack;
3449 
3450 /* approaching file_max? */
3451 if ((cap->issued & CEPH_CAP_FILE_WR) &&
31f18b77
FG
3452 cap == in->auth_cap &&
3453 is_max_size_approaching(in)) {
7c673cae 3454 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3455 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3456 goto ack;
3457 }
3458 
3459 /* completed revocation? */
3460 if (revoking && (revoking & cap_used) == 0) {
3461 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3462 goto ack;
3463 }
3464 
3465 /* want more caps from mds? */
3466 if (wanted & ~(cap->wanted | cap->issued))
3467 goto ack;
3468 
3469 if (!revoking && unmounting && (cap_used == 0))
3470 goto ack;
3471 
3472 if (wanted == cap->wanted && // mds knows what we want.
3473 ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
3474 !in->dirty_caps) // and we have no dirty caps
3475 continue;
3476 
3477 if (now < in->hold_caps_until) {
3478 ldout(cct, 10) << "delaying cap release" << dendl;
3479 continue;
3480 }
3481 
3482 ack:
3483 // re-send old cap/snapcap flushes first.
3484 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3485 session->mds_state < MDSMap::STATE_ACTIVE &&
3486 session->early_flushing_caps.count(in) == 0) {
3487 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3488 << " to mds." << session->mds_num << dendl;
3489 session->early_flushing_caps.insert(in);
3490 if (in->cap_snaps.size())
3491 flush_snaps(in, true);
3492 if (in->flushing_caps)
3493 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3494 }
3495 
// Only the auth cap can carry a flush of dirty metadata.
3496 int flushing;
3497 ceph_tid_t flush_tid;
3498 if (in->auth_cap == cap && in->dirty_caps) {
3499 flushing = mark_caps_flushing(in, &flush_tid);
3500 } else {
3501 flushing = 0;
3502 flush_tid = 0;
3503 }
3504 
3505 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3506 retain, flushing, flush_tid);
3507 }
3508 }
3509
3510
3511void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3512{
3513 int used = get_caps_used(in);
3514 int dirty = in->caps_dirty();
3515 ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3516
3517 if (in->cap_snaps.size() &&
3518 in->cap_snaps.rbegin()->second.writing) {
3519 ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
3520 return;
3521 } else if (in->caps_dirty() ||
3522 (used & CEPH_CAP_FILE_WR) ||
3523 (dirty & CEPH_CAP_ANY_WR)) {
3524 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3525 assert(capsnapem.second == true); /* element inserted */
3526 CapSnap &capsnap = capsnapem.first->second;
3527 capsnap.context = old_snapc;
3528 capsnap.issued = in->caps_issued();
3529 capsnap.dirty = in->caps_dirty();
3530
3531 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3532
3533 capsnap.uid = in->uid;
3534 capsnap.gid = in->gid;
3535 capsnap.mode = in->mode;
3536 capsnap.btime = in->btime;
3537 capsnap.xattrs = in->xattrs;
3538 capsnap.xattr_version = in->xattr_version;
3539
3540 if (used & CEPH_CAP_FILE_WR) {
3541 ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
3542 capsnap.writing = 1;
3543 } else {
3544 finish_cap_snap(in, capsnap, used);
3545 }
3546 } else {
3547 ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
3548 }
3549}
3550
3551void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3552{
3553 ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3554 capsnap.size = in->size;
3555 capsnap.mtime = in->mtime;
3556 capsnap.atime = in->atime;
3557 capsnap.ctime = in->ctime;
3558 capsnap.time_warp_seq = in->time_warp_seq;
3559 capsnap.change_attr = in->change_attr;
3560
3561 capsnap.dirty |= in->caps_dirty();
3562
3563 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3564 capsnap.inline_data = in->inline_data;
3565 capsnap.inline_version = in->inline_version;
3566 }
3567
3568 if (used & CEPH_CAP_FILE_BUFFER) {
3569 ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
3570 << " WRBUFFER, delaying" << dendl;
3571 } else {
3572 capsnap.dirty_data = 0;
3573 flush_snaps(in);
3574 }
3575}
3576
3577void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3578{
3579 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3580 in->cap_snaps.at(seq).dirty_data = 0;
3581 flush_snaps(in);
3582}
3583
// Send a CEPH_CAP_OP_FLUSHSNAP to the auth MDS for every flushable queued
// cap_snap on 'in' (skipping any that still have writers or dirty buffered
// data).  With all_again=true, snaps that already have a flush_tid are
// re-sent (used when reflushing after an MDS reconnect).
3584 void Client::flush_snaps(Inode *in, bool all_again)
3585 {
3586 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3587 assert(in->cap_snaps.size());
3588 
3589 // pick auth mds
3590 assert(in->auth_cap);
3591 MetaSession *session = in->auth_cap->session;
3592 int mseq = in->auth_cap->mseq;
3593 
3594 for (auto &p : in->cap_snaps) {
3595 CapSnap &capsnap = p.second;
3596 if (!all_again) {
3597 // only flush once per session
3598 if (capsnap.flush_tid > 0)
3599 continue;
3600 }
3601 
3602 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3603 << " follows " << p.first
3604 << " size " << capsnap.size
3605 << " mtime " << capsnap.mtime
3606 << " dirty_data=" << capsnap.dirty_data
3607 << " writing=" << capsnap.writing
3608 << " on " << *in << dendl;
// Not flushable yet: data still dirty or a writer still active.
3609 if (capsnap.dirty_data || capsnap.writing)
3610 continue;
3611 
// First flush of this snap: allocate a tid and track it on the session.
3612 if (capsnap.flush_tid == 0) {
3613 capsnap.flush_tid = ++last_flush_tid;
3614 if (!in->flushing_cap_item.is_on_list())
3615 session->flushing_caps.push_back(&in->flushing_cap_item);
3616 session->flushing_caps_tids.insert(capsnap.flush_tid);
3617 }
3618 
3619 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3620 cap_epoch_barrier);
3621 if (user_id >= 0)
3622 m->caller_uid = user_id;
3623 if (group_id >= 0)
3624 m->caller_gid = group_id;
3625 
3626 m->set_client_tid(capsnap.flush_tid);
3627 m->head.snap_follows = p.first;
3628 
// Populate the message from the state captured at snapshot time.
3629 m->head.caps = capsnap.issued;
3630 m->head.dirty = capsnap.dirty;
3631 
3632 m->head.uid = capsnap.uid;
3633 m->head.gid = capsnap.gid;
3634 m->head.mode = capsnap.mode;
3635 m->btime = capsnap.btime;
3636 
3637 m->size = capsnap.size;
3638 
3639 m->head.xattr_version = capsnap.xattr_version;
3640 ::encode(capsnap.xattrs, m->xattrbl);
3641 
3642 m->ctime = capsnap.ctime;
3643 m->btime = capsnap.btime;
3644 m->mtime = capsnap.mtime;
3645 m->atime = capsnap.atime;
3646 m->time_warp_seq = capsnap.time_warp_seq;
3647 m->change_attr = capsnap.change_attr;
3648 
3649 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3650 m->inline_version = in->inline_version;
3651 m->inline_data = in->inline_data;
3652 }
3653 
3654 assert(!session->flushing_caps_tids.empty());
3655 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3656 
3657 session->con->send_message(m);
3658 }
3659 }
3660
3661
3662
3663void Client::wait_on_list(list<Cond*>& ls)
3664{
3665 Cond cond;
3666 ls.push_back(&cond);
3667 cond.Wait(client_lock);
3668 ls.remove(&cond);
3669}
3670
3671void Client::signal_cond_list(list<Cond*>& ls)
3672{
3673 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3674 (*it)->Signal();
3675}
3676
3677void Client::wait_on_context_list(list<Context*>& ls)
3678{
3679 Cond cond;
3680 bool done = false;
3681 int r;
3682 ls.push_back(new C_Cond(&cond, &done, &r));
3683 while (!done)
3684 cond.Wait(client_lock);
3685}
3686
3687void Client::signal_context_list(list<Context*>& ls)
3688{
3689 while (!ls.empty()) {
3690 ls.front()->complete(0);
3691 ls.pop_front();
3692 }
3693}
3694
3695void Client::wake_inode_waiters(MetaSession *s)
3696{
3697 xlist<Cap*>::iterator iter = s->caps.begin();
3698 while (!iter.end()){
3699 signal_cond_list((*iter)->inode->waitfor_caps);
3700 ++iter;
3701 }
3702}
3703
3704
3705// flush dirty data (from objectcache)
3706
3707class C_Client_CacheInvalidate : public Context {
3708private:
3709 Client *client;
3710 vinodeno_t ino;
3711 int64_t offset, length;
3712public:
3713 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3714 client(c), offset(off), length(len) {
3715 if (client->use_faked_inos())
3716 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3717 else
3718 ino = in->vino();
3719 }
3720 void finish(int r) override {
3721 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3722 assert(!client->client_lock.is_locked_by_me());
3723 client->_async_invalidate(ino, offset, length);
3724 }
3725};
3726
3727void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3728{
3729 if (unmounting)
3730 return;
3731 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3732 ino_invalidate_cb(callback_handle, ino, off, len);
3733}
3734
3735void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3736
3737 if (ino_invalidate_cb)
3738 // we queue the invalidate, which calls the callback and decrements the ref
3739 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3740}
3741
3742void Client::_invalidate_inode_cache(Inode *in)
3743{
3744 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3745
3746 // invalidate our userspace inode cache
3747 if (cct->_conf->client_oc)
3748 objectcacher->release_set(&in->oset);
3749
3750 _schedule_invalidate_callback(in, 0, 0);
3751}
3752
3753void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3754{
3755 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3756
3757 // invalidate our userspace inode cache
3758 if (cct->_conf->client_oc) {
3759 vector<ObjectExtent> ls;
3760 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3761 objectcacher->discard_set(&in->oset, ls);
3762 }
3763
3764 _schedule_invalidate_callback(in, off, len);
3765}
3766
3767bool Client::_release(Inode *in)
3768{
3769 ldout(cct, 20) << "_release " << *in << dendl;
3770 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3771 _invalidate_inode_cache(in);
3772 return true;
3773 }
3774 return false;
3775}
3776
3777bool Client::_flush(Inode *in, Context *onfinish)
3778{
3779 ldout(cct, 10) << "_flush " << *in << dendl;
3780
3781 if (!in->oset.dirty_or_tx) {
3782 ldout(cct, 10) << " nothing to flush" << dendl;
3783 onfinish->complete(0);
3784 return true;
3785 }
3786
3787 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3788 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3789 objectcacher->purge_set(&in->oset);
3790 if (onfinish) {
3791 onfinish->complete(-ENOSPC);
3792 }
3793 return true;
3794 }
3795
3796 return objectcacher->flush_set(&in->oset, onfinish);
3797}
3798
// Synchronously flush dirty buffered data in [offset, offset+size) for this
// inode.  Blocks (dropping client_lock while waiting) until the object
// cacher reports the range safe on disk.
3799 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3800 {
3801 assert(client_lock.is_locked());
3802 if (!in->oset.dirty_or_tx) {
3803 ldout(cct, 10) << " nothing to flush" << dendl;
3804 return;
3805 }
3806 
// Private mutex/cond pair signalled by C_SafeCond when the flush lands.
3807 Mutex flock("Client::_flush_range flock");
3808 Cond cond;
3809 bool safe = false;
3810 Context *onflush = new C_SafeCond(&flock, &cond, &safe);
3811 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3812 offset, size, onflush);
3813 if (!ret) {
3814 // wait for flush
// Drop client_lock before sleeping so the flush completion can make
// progress; reacquire it afterwards to restore the caller's invariant.
3815 client_lock.Unlock();
3816 flock.Lock();
3817 while (!safe)
3818 cond.Wait(flock);
3819 flock.Unlock();
3820 client_lock.Lock();
3821 }
3822 }
3823
3824void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3825{
3826 // Mutex::Locker l(client_lock);
3827 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3828 Inode *in = static_cast<Inode *>(oset->parent);
3829 assert(in);
3830 _flushed(in);
3831}
3832
3833void Client::_flushed(Inode *in)
3834{
3835 ldout(cct, 10) << "_flushed " << *in << dendl;
3836
3837 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3838}
3839
3840
3841
3842// checks common to add_update_cap, handle_cap_grant
3843void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3844{
3845 unsigned had = in->caps_issued();
3846
3847 if ((issued & CEPH_CAP_FILE_CACHE) &&
3848 !(had & CEPH_CAP_FILE_CACHE))
3849 in->cache_gen++;
3850
3851 if ((issued & CEPH_CAP_FILE_SHARED) &&
3852 !(had & CEPH_CAP_FILE_SHARED)) {
3853 in->shared_gen++;
3854
3855 if (in->is_dir())
3856 clear_dir_complete_and_ordered(in, true);
3857 }
3858}
3859
// Install or update the cap for 'in' held via 'mds_session': creates the
// Cap (attaching the inode to its snap realm on the first cap), handles
// the auth-MDS-change race against cap export/import, merges the newly
// issued bits, and wakes any waiters that can now proceed.
3860 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
3861 unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
3862 int flags, const UserPerm& cap_perms)
3863 {
3864 Cap *cap = 0;
3865 mds_rank_t mds = mds_session->mds_num;
3866 if (in->caps.count(mds)) {
3867 cap = in->caps[mds];
3868 
3869 /*
3870 * auth mds of the inode changed. we received the cap export
3871 * message, but still haven't received the cap import message.
3872 * handle_cap_export() updated the new auth MDS' cap.
3873 *
3874 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
3875 * a message that was send before the cap import message. So
3876 * don't remove caps.
3877 */
3878 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3879 assert(cap == in->auth_cap);
3880 assert(cap->cap_id == cap_id);
3881 seq = cap->seq;
3882 mseq = cap->mseq;
3883 issued |= cap->issued;
3884 flags |= CEPH_CAP_FLAG_AUTH;
3885 }
3886 } else {
// First cap from this MDS; first cap overall also opens the snap realm.
3887 mds_session->num_caps++;
3888 if (!in->is_any_caps()) {
3889 assert(in->snaprealm == 0);
3890 in->snaprealm = get_snap_realm(realm);
3891 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3892 ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
3893 }
3894 in->caps[mds] = cap = new Cap;
3895 
3896 mds_session->caps.push_back(&cap->cap_item);
3897 cap->session = mds_session;
3898 cap->inode = in;
3899 cap->gen = mds_session->cap_gen;
3900 cap_list.push_back(&in->cap_item);
3901 }
3902 
3903 check_cap_issue(in, cap, issued);
3904 
// Auth cap changes migrate any in-flight flushes to the new session.
3905 if (flags & CEPH_CAP_FLAG_AUTH) {
3906 if (in->auth_cap != cap &&
3907 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
3908 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
3909 ldout(cct, 10) << "add_update_cap changing auth cap: "
3910 << "add myself to new auth MDS' flushing caps list" << dendl;
3911 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
3912 }
3913 in->auth_cap = cap;
3914 }
3915 }
3916 
3917 unsigned old_caps = cap->issued;
3918 cap->cap_id = cap_id;
3919 cap->issued |= issued;
3920 cap->implemented |= issued;
3921 cap->seq = seq;
3922 cap->issue_seq = seq;
3923 cap->mseq = mseq;
3924 cap->latest_perms = cap_perms;
3925 ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
3926 << " from mds." << mds
3927 << " on " << *in
3928 << dendl;
3929 
3930 if ((issued & ~old_caps) && in->auth_cap == cap) {
3931 // non-auth MDS is revoking the newly grant caps ?
3932 for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
3933 if (it->second == cap)
3934 continue;
3935 if (it->second->implemented & ~it->second->issued & issued) {
3936 check_caps(in, CHECK_CAPS_NODELAY);
3937 break;
3938 }
3939 }
3940 }
3941 
// Newly issued caps may unblock get_caps() waiters.
3942 if (issued & ~old_caps)
3943 signal_cond_list(in->waitfor_caps);
3944 }
3945
3946void Client::remove_cap(Cap *cap, bool queue_release)
3947{
3948 Inode *in = cap->inode;
3949 MetaSession *session = cap->session;
3950 mds_rank_t mds = cap->session->mds_num;
3951
3952 ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;
3953
3954 if (queue_release) {
3955 session->enqueue_cap_release(
3956 in->ino,
3957 cap->cap_id,
3958 cap->issue_seq,
3959 cap->mseq,
3960 cap_epoch_barrier);
3961 }
3962
3963 if (in->auth_cap == cap) {
3964 if (in->flushing_cap_item.is_on_list()) {
3965 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
3966 in->flushing_cap_item.remove_myself();
3967 }
3968 in->auth_cap = NULL;
3969 }
3970 assert(in->caps.count(mds));
3971 in->caps.erase(mds);
3972
3973 cap->cap_item.remove_myself();
3974 delete cap;
3975 cap = nullptr;
3976
3977 if (!in->is_any_caps()) {
3978 ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
3979 in->snaprealm_item.remove_myself();
3980 put_snap_realm(in->snaprealm);
3981 in->snaprealm = 0;
3982 }
3983}
3984
3985void Client::remove_all_caps(Inode *in)
3986{
3987 while (!in->caps.empty())
3988 remove_cap(in->caps.begin()->second, true);
3989}
3990
// Drop every cap held through session 's' (used when a session dies).
// Auth-cap inodes are flagged I_CAP_DROPPED so later get_caps() calls
// re-request their caps; dirty/flushing state that can no longer complete
// is discarded (with an error logged) and the corresponding pins released.
3991 void Client::remove_session_caps(MetaSession *s)
3992 {
3993 ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;
3994 
3995 while (s->caps.size()) {
3996 Cap *cap = *s->caps.begin();
3997 Inode *in = cap->inode;
3998 bool dirty_caps = false, cap_snaps = false;
3999 if (in->auth_cap == cap) {
4000 cap_snaps = !in->cap_snaps.empty();
4001 dirty_caps = in->dirty_caps | in->flushing_caps;
4002 in->wanted_max_size = 0;
4003 in->requested_max_size = 0;
4004 in->flags |= I_CAP_DROPPED;
4005 }
4006 remove_cap(cap, false);
4007 signal_cond_list(in->waitfor_caps);
4008 if (cap_snaps) {
// hold a temporary ref so clearing the snaps can't free the inode mid-loop
4009 InodeRef tmp_ref(in);
4010 in->cap_snaps.clear();
4011 }
4012 if (dirty_caps) {
4013 lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
4014 if (in->flushing_caps) {
4015 num_flushing_caps--;
4016 in->flushing_cap_tids.clear();
4017 }
4018 in->flushing_caps = 0;
4019 in->dirty_caps = 0;
// drop the pin taken when the caps were first dirtied
4020 put_inode(in);
4021 }
4022 }
4023 s->flushing_caps_tids.clear();
4024 sync_cond.Signal();
4025 }
4026
4027class C_Client_Remount : public Context {
4028private:
4029 Client *client;
4030public:
4031 explicit C_Client_Remount(Client *c) : client(c) {}
4032 void finish(int r) override {
4033 assert (r == 0);
4034 r = client->remount_cb(client->callback_handle);
4035 if (r != 0) {
4036 client_t whoami = client->get_nodeid();
4037 lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error "
4038 << r << dendl;
4039 if (client->require_remount && !client->unmounting) {
4040 assert(0 == "failed to remount for kernel dentry trimming");
4041 }
4042 }
4043 }
4044};
4045
4046void Client::_invalidate_kernel_dcache()
4047{
4048 if (unmounting)
4049 return;
4050 if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
4051 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4052 p != root->dir->dentries.end();
4053 ++p) {
4054 if (p->second->inode)
4055 _schedule_invalidate_dentry_callback(p->second, false);
4056 }
4057 } else if (remount_cb) {
4058 // Hacky:
4059 // when remounting a file system, linux kernel trims all unused dentries in the fs
4060 remount_finisher.queue(new C_Client_Remount(this));
4061 }
4062}
4063
// Shrink the number of caps held through session 's' toward 'max' (the
// MDS-requested limit): disposable non-auth caps are released outright;
// otherwise we try to trim the inode's dentries so the cap becomes
// droppable, finishing with a kernel dcache invalidation if still over.
4064 void Client::trim_caps(MetaSession *s, int max)
4065 {
4066 mds_rank_t mds = s->mds_num;
4067 int caps_size = s->caps.size();
4068 ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
4069 << " caps " << caps_size << dendl;
4070 
4071 int trimmed = 0;
4072 xlist<Cap*>::iterator p = s->caps.begin();
4073 while ((caps_size - trimmed) > max && !p.end()) {
4074 Cap *cap = *p;
4075 Inode *in = cap->inode;
4076 
4077 // Increment p early because it will be invalidated if cap
4078 // is deleted inside remove_cap
4079 ++p;
4080 
4081 if (in->caps.size() > 1 && cap != in->auth_cap) {
4082 int mine = cap->issued | cap->implemented;
4083 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4084 // disposable non-auth cap
4085 if (!(get_caps_used(in) & ~oissued & mine)) {
4086 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4087 remove_cap(cap, true);
4088 trimmed++;
4089 }
4090 } else {
4091 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4092 bool all = true;
4093 set<Dentry*>::iterator q = in->dn_set.begin();
// tmp_ref keeps the inode alive while its dentries are being trimmed
4094 InodeRef tmp_ref(in);
4095 while (q != in->dn_set.end()) {
4096 Dentry *dn = *q++;
4097 if (dn->lru_is_expireable()) {
4098 if (can_invalidate_dentries &&
4099 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4100 // Only issue one of these per DN for inodes in root: handle
4101 // others more efficiently by calling for root-child DNs at
4102 // the end of this function.
4103 _schedule_invalidate_dentry_callback(dn, true);
4104 }
4105 trim_dentry(dn);
4106 } else {
4107 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4108 all = false;
4109 }
4110 }
4111 if (all && in->ino != MDS_INO_ROOT) {
4112 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4113 trimmed++;
4114 }
4115 }
4116 }
4117 
// NOTE(review): signed/unsigned comparison — caps.size() is unsigned while
// 'max' is int; confirm max can never be negative here.
4118 if (s->caps.size() > max)
4119 _invalidate_kernel_dcache();
4120 }
4121
4122void Client::force_session_readonly(MetaSession *s)
4123{
4124 s->readonly = true;
4125 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4126 Inode *in = (*p)->inode;
4127 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4128 signal_cond_list(in->waitfor_caps);
4129 }
4130}
4131
4132void Client::mark_caps_dirty(Inode *in, int caps)
4133{
4134 ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
4135 << ccap_string(in->dirty_caps | caps) << dendl;
4136 if (caps && !in->caps_dirty())
4137 in->get();
4138 in->dirty_caps |= caps;
4139}
4140
4141int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4142{
4143 MetaSession *session = in->auth_cap->session;
4144
4145 int flushing = in->dirty_caps;
4146 assert(flushing);
4147
4148 ceph_tid_t flush_tid = ++last_flush_tid;
4149 in->flushing_cap_tids[flush_tid] = flushing;
4150
4151 if (!in->flushing_caps) {
4152 ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
4153 num_flushing_caps++;
4154 } else {
4155 ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
4156 }
4157
4158 in->flushing_caps |= flushing;
4159 in->dirty_caps = 0;
4160
4161 if (!in->flushing_cap_item.is_on_list())
4162 session->flushing_caps.push_back(&in->flushing_cap_item);
4163 session->flushing_caps_tids.insert(flush_tid);
4164
4165 *ptid = flush_tid;
4166 return flushing;
4167}
4168
4169void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4170{
4171 for (auto &p : in->cap_snaps) {
4172 CapSnap &capsnap = p.second;
4173 if (capsnap.flush_tid > 0) {
4174 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4175 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4176 }
4177 }
4178 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4179 it != in->flushing_cap_tids.end();
4180 ++it) {
4181 old_s->flushing_caps_tids.erase(it->first);
4182 new_s->flushing_caps_tids.insert(it->first);
4183 }
4184 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4185}
4186
4187/*
4188 * Flush all caps back to the MDS. Because the callers generally wait on the
4189 * result of this function (syncfs and umount cases), we set
4190 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4191 */
4192void Client::flush_caps_sync()
4193{
4194 ldout(cct, 10) << __func__ << dendl;
4195 xlist<Inode*>::iterator p = delayed_caps.begin();
4196 while (!p.end()) {
4197 unsigned flags = CHECK_CAPS_NODELAY;
4198 Inode *in = *p;
4199
4200 ++p;
4201 delayed_caps.pop_front();
4202 if (p.end() && cap_list.empty())
4203 flags |= CHECK_CAPS_SYNCHRONOUS;
4204 check_caps(in, flags);
4205 }
4206
4207 // other caps, too
4208 p = cap_list.begin();
4209 while (!p.end()) {
4210 unsigned flags = CHECK_CAPS_NODELAY;
4211 Inode *in = *p;
4212
4213 ++p;
4214 if (p.end())
4215 flags |= CHECK_CAPS_SYNCHRONOUS;
4216 check_caps(in, flags);
4217 }
4218}
4219
// (Re)send a cap flush message for every flush tid still outstanding on
// this inode.  Caps are always flushed via the auth cap; when 'sync' is
// set, the flush with the highest tid additionally asks the MDS to commit.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4241
// Block until every cap flush on this inode with tid <= 'want' has been
// acked (handle_cap_flush_ack erases acked tids and signals waitfor_caps).
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    // oldest outstanding tid is already newer than what we need -- done
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4255
4256void Client::wait_sync_caps(ceph_tid_t want)
4257{
4258 retry:
4259 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4260 << num_flushing_caps << " total flushing)" << dendl;
4261 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4262 p != mds_sessions.end();
4263 ++p) {
4264 MetaSession *s = p->second;
4265 if (s->flushing_caps_tids.empty())
4266 continue;
4267 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4268 if (oldest_tid <= want) {
4269 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4270 << " (want " << want << ")" << dendl;
4271 sync_cond.Wait(client_lock);
4272 goto retry;
4273 }
4274 }
4275}
4276
// After an MDS session is re-established, re-send all outstanding cap and
// capsnap flushes on that session.  Inodes already reflushed during the
// reconnect phase (early_kick_flushing_caps) are skipped.
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  // the early set has served its purpose for this reconnect cycle
  session->early_flushing_caps.clear();
}
4295
// During client reconnect, re-send cap flushes whose flushing bits were
// revoked, before the MDS might issue those caps to another client.  Inodes
// handled here are recorded in early_flushing_caps so kick_flushing_caps
// does not flush them a second time.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4322
4323void Client::kick_maxsize_requests(MetaSession *session)
4324{
4325 xlist<Cap*>::iterator iter = session->caps.begin();
4326 while (!iter.end()){
4327 (*iter)->inode->requested_max_size = 0;
4328 (*iter)->inode->wanted_max_size = 0;
4329 signal_cond_list((*iter)->inode->waitfor_caps);
4330 ++iter;
4331 }
4332}
4333
4334void SnapRealm::build_snap_context()
4335{
4336 set<snapid_t> snaps;
4337 snapid_t max_seq = seq;
4338
4339 // start with prior_parents?
4340 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4341 snaps.insert(prior_parent_snaps[i]);
4342
4343 // current parent's snaps
4344 if (pparent) {
4345 const SnapContext& psnapc = pparent->get_snap_context();
4346 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4347 if (psnapc.snaps[i] >= parent_since)
4348 snaps.insert(psnapc.snaps[i]);
4349 if (psnapc.seq > max_seq)
4350 max_seq = psnapc.seq;
4351 }
4352
4353 // my snaps
4354 for (unsigned i=0; i<my_snaps.size(); i++)
4355 snaps.insert(my_snaps[i]);
4356
4357 // ok!
4358 cached_snap_context.seq = max_seq;
4359 cached_snap_context.snaps.resize(0);
4360 cached_snap_context.snaps.reserve(snaps.size());
4361 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4362 cached_snap_context.snaps.push_back(*p);
4363}
4364
4365void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4366{
4367 list<SnapRealm*> q;
4368 q.push_back(realm);
4369
4370 while (!q.empty()) {
4371 realm = q.front();
4372 q.pop_front();
4373
4374 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4375 realm->invalidate_cache();
4376
4377 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4378 p != realm->pchildren.end();
4379 ++p)
4380 q.push_back(*p);
4381 }
4382}
4383
4384SnapRealm *Client::get_snap_realm(inodeno_t r)
4385{
4386 SnapRealm *realm = snap_realms[r];
4387 if (!realm)
4388 snap_realms[r] = realm = new SnapRealm(r);
4389 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4390 realm->nref++;
4391 return realm;
4392}
4393
4394SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4395{
4396 if (snap_realms.count(r) == 0) {
4397 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4398 return NULL;
4399 }
4400 SnapRealm *realm = snap_realms[r];
4401 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4402 realm->nref++;
4403 return realm;
4404}
4405
// Drop a reference on a snap realm.  On the last ref the realm is removed
// from the map, detached from its parent (recursively dropping the ref it
// held on the parent), and freed.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4419
4420bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4421{
4422 if (realm->parent != parent) {
4423 ldout(cct, 10) << "adjust_realm_parent " << *realm
4424 << " " << realm->parent << " -> " << parent << dendl;
4425 realm->parent = parent;
4426 if (realm->pparent) {
4427 realm->pparent->pchildren.erase(realm);
4428 put_snap_realm(realm->pparent);
4429 }
4430 realm->pparent = get_snap_realm(parent);
4431 realm->pparent->pchildren.insert(realm);
4432 return true;
4433 }
4434 return false;
4435}
4436
4437static bool has_new_snaps(const SnapContext& old_snapc,
4438 const SnapContext& new_snapc)
4439{
4440 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4441}
4442
4443
4444void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4445{
4446 SnapRealm *first_realm = NULL;
4447 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4448
4449 map<SnapRealm*, SnapContext> dirty_realms;
4450
4451 bufferlist::iterator p = bl.begin();
4452 while (!p.end()) {
4453 SnapRealmInfo info;
4454 ::decode(info, p);
4455 SnapRealm *realm = get_snap_realm(info.ino());
4456
4457 bool invalidate = false;
4458
4459 if (info.seq() > realm->seq) {
4460 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4461 << dendl;
4462
4463 if (flush) {
4464 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4465 // flush me + children
4466 list<SnapRealm*> q;
4467 q.push_back(realm);
4468 while (!q.empty()) {
4469 SnapRealm *realm = q.front();
4470 q.pop_front();
4471
4472 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4473 p != realm->pchildren.end();
4474 ++p)
4475 q.push_back(*p);
4476
4477 if (dirty_realms.count(realm) == 0) {
4478 realm->nref++;
4479 dirty_realms[realm] = realm->get_snap_context();
4480 }
4481 }
4482 }
4483
4484 // update
4485 realm->seq = info.seq();
4486 realm->created = info.created();
4487 realm->parent_since = info.parent_since();
4488 realm->prior_parent_snaps = info.prior_parent_snaps;
4489 realm->my_snaps = info.my_snaps;
4490 invalidate = true;
4491 }
4492
4493 // _always_ verify parent
4494 if (adjust_realm_parent(realm, info.parent()))
4495 invalidate = true;
4496
4497 if (invalidate) {
4498 invalidate_snaprealm_and_children(realm);
4499 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4500 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4501 } else {
4502 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4503 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4504 }
4505
4506 if (!first_realm)
4507 first_realm = realm;
4508 else
4509 put_snap_realm(realm);
4510 }
4511
4512 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4513 q != dirty_realms.end();
4514 ++q) {
4515 SnapRealm *realm = q->first;
4516 // if there are new snaps ?
4517 if (has_new_snaps(q->second, realm->get_snap_context())) {
4518 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4519 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4520 while (!r.end()) {
4521 Inode *in = *r;
4522 ++r;
4523 queue_cap_snap(in, q->second);
4524 }
4525 } else {
4526 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4527 }
4528 put_snap_realm(realm);
4529 }
4530
4531 if (realm_ret)
4532 *realm_ret = first_realm;
4533 else
4534 put_snap_realm(first_realm);
4535}
4536
// Handle an MClientSnap notification from an MDS.  For a SPLIT op, the
// inodes and child realms named in the message are detached from their
// current realms first, the snap trace is applied, and the inodes are then
// re-attached to the new realm (queueing snap writeback if the move exposed
// new snapshots).
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes being moved into the split realm, keyed with their *old* snap
  // contexts so we can detect newly-visible snaps after the move
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// an inode already under a realm created after this split must not
	// be pulled back into the (older) split realm
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // a DESTROY removes snaps, so there is nothing to flush first
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4614
4615void Client::handle_quota(MClientQuota *m)
4616{
4617 mds_rank_t mds = mds_rank_t(m->get_source().num());
4618 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4619 if (!session) {
4620 m->put();
4621 return;
4622 }
4623
4624 got_mds_push(session);
4625
4626 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4627
4628 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4629 if (inode_map.count(vino)) {
4630 Inode *in = NULL;
4631 in = inode_map[vino];
4632
4633 if (in) {
4634 in->quota = m->quota;
4635 in->rstat = m->rstat;
4636 }
4637 }
4638
4639 m->put();
4640}
4641
// Top-level dispatcher for MClientCaps messages.  Records any OSD epoch
// barrier, locates the target inode, and routes to the per-op handler.
// Note the control flow for IMPORT: handle_cap_import() is called from the
// first switch *without* returning, so the message then also flows through
// the second switch into handle_cap_grant() (which is why handle_cap_import
// itself does not m->put()).
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload(); // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // ops that don't need (or that themselves establish) a local cap record
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);  // falls through to grant handling below
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4715
// Handle a cap IMPORT: the target MDS is taking over this inode's cap from
// a peer MDS.  Applies the attached snap trace, installs/updates the new
// auth cap, removes the stale peer cap, and reflushes outstanding dirty
// caps/capsnaps if we became auth.  Deliberately does NOT m->put(): the
// caller (handle_caps) continues processing the same message as a grant.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // capture the perms from the cap being replaced, so the new cap keeps them
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // retire the old peer cap this import supersedes
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4756
// Handle a cap EXPORT: this MDS is handing the inode's cap to a peer MDS.
// If the peer cap already exists it is refreshed/merged; otherwise a new
// cap record is created for the peer.  If there is no peer (cap simply
// dropped) and we were auth, mark I_CAP_DROPPED.  The local cap is removed
// in all matched cases.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the message refers to the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// only update the peer cap if the message carries newer state
	if (tcap->cap_id != m->peer.cap_id ||
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  // if auth moved sessions, migrate in-flight flush bookkeeping too
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4804
// Handle a TRUNC cap message: fold the MDS's new size/truncate/time state
// into the cached inode, honoring locally issued+dirty caps.
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  // issued|implemented|dirty tells update_inode_file_bits which fields the
  // client itself is authoritative for (and must therefore not clobber)
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4823
// Handle a FLUSH_ACK for a prior cap flush.  The acked tid implicitly acks
// all older tids too; cap bits that are still being flushed under *newer*
// tids are masked out of 'cleaned' so they are not marked clean early.
// Drops the dirty-cap inode ref taken in mark_caps_dirty once everything
// is clean.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    // the acked tid tells us which bits this ack covers
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    // this tid and all older ones are now complete
    if (it->first <= flush_ack_tid) {
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // bits still in flight under a newer tid are not clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session's oldest outstanding tid advanced
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // release the ref taken by mark_caps_dirty
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4883
4884
// Handle a FLUSHSNAP_ACK: retire the matching CapSnap (matched by both
// snap-follows and flush tid).  Duplicate or stale acks are logged and
// ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4914
// Context that invokes the dentry-invalidate callback asynchronously.
// All data needed later (dir ino, target ino, name) is copied out of the
// Dentry in the constructor, while the client lock is still held; finish()
// then runs without the client lock.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;   // parent directory's (possibly faked) vino
  vinodeno_t ino;      // target inode's vino; zero ino if !del
  string name;         // dentry name, copied (Dentry may be freed later)
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    // when not deleting, signal "no inode" with a zero ino
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4942
// Invoke the registered dentry-invalidate callback (runs without the client
// lock; see C_Client_DentryInvalidate::finish).  No-op while unmounting.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4951
// Queue an async dentry invalidation, but only if a callback is registered
// and the dentry's inode is actually referenced by the kernel (ll_ref > 0).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
4957
// Best-effort trim of an inode's cached state: expire its child dentries
// (recursing into snapshot subtrees), close an emptied Dir, trim an open
// snapdir, and finally — if references remain and sched_inval is set —
// schedule kernel dcache invalidation for the inode's own dentries.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // close_dir dropped the Dir's ref on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5000
// Handle a cap GRANT/REVOKE (and the grant half of an IMPORT): fold the
// message's inode metadata into the cache — but only for locks we do not
// hold exclusively — then update the cap's issued bits.  A revocation may
// trigger buffer writeback or cache release before we can ack; a grant on
// the auth cap may require check_caps if a non-auth MDS is revoking the
// newly granted bits.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only accept MDS-supplied attrs for locks we don't hold EXCL on
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    // nlink hitting 0 with LINK caps granted means the file was unlinked
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  if (old_caps & ~new_caps) {
    // revocation: we may have to write back buffers or drop cached data
    // before acking via check_caps
    ldout(cct, 10) << "  revocation of " << ccap_string(~new_caps & old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
        && !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5114
// Obtain the supplementary group list for (uid, gid).  Tries the
// registered getgroups callback first; otherwise falls back to the system
// getgrouplist(3) (when available), growing the malloc'd buffer until it
// fits.  On success stores a malloc-owned array in *sgids (caller frees)
// and returns its length; returns a negative errno on failure, or 0 when
// no mechanism is available.
// NOTE(review): if getgroups_cb returns <= 0 we silently fall through to
// the system lookup — presumably intentional best-effort; confirm.
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    // NOTE(review): errno may be 0 when the user simply doesn't exist —
    // -errno could then be 0 (looks like success); confirm callers cope
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again;
      // getgrouplist updated sgid_count to the required size
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5171
// Permission check against an inode: root bypasses everything; otherwise,
// when the caller is not the owner and group bits are set, POSIX ACLs are
// consulted first (they return -EAGAIN to defer to the classic mode bits).
// Returns 0 on success or -EACCES.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5188
// Permission check for xattr access.  Refreshes attrs needed for the
// decision, then: writes to "system." xattrs require root or ownership;
// everything else falls through to the normal inode permission check.
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    // system namespace: writable only by root or the file owner
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5207
5208ostream& operator<<(ostream &out, const UserPerm& perm) {
5209 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5210 return out;
5211}
5212
/*
 * Check whether the caller (@perms) may apply the attribute changes in
 * @stx selected by @mask to inode @in.  Returns 0 if allowed, or a
 * negative errno (-EPERM/-EACCES or a getattr failure).
 * May clear S_ISGID in stx->stx_mode as a side effect (see below).
 */
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
  // make sure we have fresh mode/owner (and ACL) info before deciding
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change ownership; the owner may only "set" uid to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may only change gid to a group it belongs to, or keep it
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
  	       	       	     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    // chmod is owner/root-only
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // a non-root caller outside the (resulting) group loses the setgid bit
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // Setting explicit timestamps is owner/root-only; setting them to
      // "now" (the *_NOW flags) merely requires write permission.
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5269
/*
 * Check whether @perms may open @in with open(2)-style @flags.
 * Translates the access mode into MAY_* bits, rejects opens that can
 * never succeed (symlinks -> -ELOOP, writable opens of directories ->
 * -EISDIR), then applies the regular permission check.
 */
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;  // truncation implies write access

  int r = 0;
  switch (in->mode & S_IFMT) {
  case S_IFLNK:
    r = -ELOOP;
    goto out;
  case S_IFDIR:
    if (want & MAY_WRITE) {
      r = -EISDIR;
      goto out;
    }
    break;
  }

  // refresh mode/ACL info, then run the standard check
  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5306
5307int Client::may_lookup(Inode *dir, const UserPerm& perms)
5308{
5309 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5310 int r = _getattr_for_perm(dir, perms);
5311 if (r < 0)
5312 goto out;
5313
5314 r = inode_permission(dir, perms, MAY_EXEC);
5315out:
5316 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5317 return r;
5318}
5319
5320int Client::may_create(Inode *dir, const UserPerm& perms)
5321{
5322 ldout(cct, 20) << __func__ << *dir << "; " << perms << dendl;
5323 int r = _getattr_for_perm(dir, perms);
5324 if (r < 0)
5325 goto out;
5326
5327 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5328out:
5329 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5330 return r;
5331}
5332
5333int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5334{
5335 ldout(cct, 20) << __func__ << *dir << "; " << "; name " << name << "; " << perms << dendl;
5336 int r = _getattr_for_perm(dir, perms);
5337 if (r < 0)
5338 goto out;
5339
5340 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5341 if (r < 0)
5342 goto out;
5343
5344 /* 'name == NULL' means rmsnap */
5345 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5346 InodeRef otherin;
5347 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5348 if (r < 0)
5349 goto out;
5350 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5351 r = -EPERM;
5352 }
5353out:
5354 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5355 return r;
5356}
5357
/*
 * Check whether @perms may create a hard link to @in.  Behavior resembles
 * Linux's protected_hardlinks policy (NOTE(review): confirm intended
 * parity): a non-owner may only link a regular file that is not setuid,
 * not setgid+group-exec, and that it can already read and write.
 */
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (perms.uid() == 0 || perms.uid() == in->uid) {
    // root and the file's owner may always link
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  // setuid files may not be linked by non-owners
  if (in->mode & S_ISUID)
    goto out;

  // nor may setgid+group-executable files
  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5385
5386int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5387{
5388 int mask = CEPH_STAT_CAP_MODE;
5389 bool force = false;
5390 if (acl_type != NO_ACL) {
5391 mask |= CEPH_STAT_CAP_XATTR;
5392 force = in->xattr_version == 0;
5393 }
5394 return _getattr(in, mask, perms, force);
5395}
5396
5397vinodeno_t Client::_get_vino(Inode *in)
5398{
5399 /* The caller must hold the client lock */
5400 return vinodeno_t(in->ino, in->snapid);
5401}
5402
5403inodeno_t Client::_get_inodeno(Inode *in)
5404{
5405 /* The caller must hold the client lock */
5406 return in->ino;
5407}
5408
5409
5410/**
5411 * Resolve an MDS spec to a list of MDS daemon GIDs.
5412 *
5413 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5414 * It may be '*' in which case it matches all GIDs.
5415 *
5416 * If no error is returned, the `targets` vector will be populated with at least
5417 * one MDS.
5418 */
5419int Client::resolve_mds(
5420 const std::string &mds_spec,
5421 std::vector<mds_gid_t> *targets)
5422{
5423 assert(fsmap);
5424 assert(targets != nullptr);
5425
5426 mds_role_t role;
5427 std::stringstream ss;
5428 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5429 if (role_r == 0) {
5430 // We got a role, resolve it to a GID
5431 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5432 << role << "'" << dendl;
5433 targets->push_back(
5434 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5435 return 0;
5436 }
5437
5438 std::string strtol_err;
5439 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5440 if (strtol_err.empty()) {
5441 // It is a possible GID
5442 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5443 if (fsmap->gid_exists(mds_gid)) {
5444 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5445 targets->push_back(mds_gid);
5446 } else {
5447 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5448 << dendl;
5449 return -ENOENT;
5450 }
5451 } else if (mds_spec == "*") {
5452 // It is a wildcard: use all MDSs
5453 const auto mds_info = fsmap->get_mds_info();
5454
5455 if (mds_info.empty()) {
5456 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5457 return -ENOENT;
5458 }
5459
5460 for (const auto i : mds_info) {
5461 targets->push_back(i.first);
5462 }
5463 } else {
5464 // It did not parse as an integer, it is not a wildcard, it must be a name
5465 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5466 if (mds_gid == 0) {
5467 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5468
5469 lderr(cct) << "FSMap: " << *fsmap << dendl;
5470
5471 return -ENOENT;
5472 } else {
5473 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5474 << "' to GID " << mds_gid << dendl;
5475 targets->push_back(mds_gid);
5476 }
5477 }
5478
5479 return 0;
5480}
5481
5482
/**
 * Authenticate with mon and establish global ID
 *
 * Called and returns with client_lock held; the lock is dropped around
 * the (blocking, network-bound) monitor handshake.  Returns 0 on success
 * or a negative errno from MonClient::authenticate().
 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;  // nothing to do
  }

  // authenticate() blocks on the network; don't hold client_lock across it
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // adopt the monitor-assigned global id as our entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5506
/*
 * Fetch the latest FSMap (or, when @user is true, the unprivileged
 * FSMapUser into fsmap_user) from the monitors, blocking until our local
 * copy is at least as new as the monitors' latest version.  Drops and
 * retakes client_lock while waiting.  Returns 0 or a negative errno.
 */
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);  // retry while the monitor asks us to

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      // subscribe once; we are woken via waiting_for_fsmap when it arrives
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      // same dance for the full (privileged) FSMap
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5550
/**
 * Send an admin command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Resolves @mds_spec to GIDs, skips laggy daemons (failing only if all
 * targets are laggy), and completes @onfinish once every targeted daemon
 * has replied; replies are matched by tid in handle_command_reply().
 * Returns 0 once the sends are dispatched, or a negative errno on
 * authentication / map-fetch / resolution failure.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  assert(initialized);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need the FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
		  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
5637
5638void Client::handle_command_reply(MCommandReply *m)
5639{
5640 ceph_tid_t const tid = m->get_tid();
5641
5642 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5643
5644 if (!command_table.exists(tid)) {
5645 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5646 m->put();
5647 return;
5648 }
5649
5650 auto &op = command_table.get_command(tid);
5651 if (op.outbl) {
5652 op.outbl->claim(m->get_data());
5653 }
5654 if (op.outs) {
5655 *op.outs = m->rs;
5656 }
5657
5658 if (op.on_finish) {
5659 op.on_finish->complete(m->r);
5660 }
5661
5662 command_table.erase(tid);
5663
5664 m->put();
5665}
5666
5667// -------------------
5668// MOUNT
5669
/*
 * Mount the filesystem: authenticate with the monitors, subscribe to the
 * MDS map (optionally for a specific named filesystem via
 * client_mds_namespace), optionally wait for an available MDS, then
 * getattr down to @mount_root to pin the root inode.  Returns 0 on
 * success or a negative errno.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: map its name to a cluster id
    // and subscribe to that filesystem's map ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // spin until the MDS cluster is usable (or clearly never will be)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // getattr the mount point and then each of its ancestors (popping one
  // dentry per iteration) so state above the mount root gets cached too
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      // EACCES on an ancestor is tolerable once we have a root inode
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5780
5781// UNMOUNT
5782
5783void Client::_close_sessions()
5784{
5785 while (!mds_sessions.empty()) {
5786 // send session closes!
5787 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5788 p != mds_sessions.end();
5789 ++p) {
5790 if (p->second->state != MetaSession::STATE_CLOSING) {
5791 _close_mds_session(p->second);
5792 }
5793 }
5794
5795 // wait for sessions to close
5796 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5797 mount_cond.Wait(client_lock);
5798 }
5799}
5800
31f18b77
FG
5801void Client::flush_mdlog_sync()
5802{
5803 if (mds_requests.empty())
5804 return;
5805 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5806 p != mds_sessions.end();
5807 ++p) {
5808 MetaSession *s = p->second;
5809 flush_mdlog(s);
5810 }
5811}
5812
5813void Client::flush_mdlog(MetaSession *session)
5814{
5815 // Only send this to Luminous or newer MDS daemons, older daemons
5816 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5817 const uint64_t features = session->con->get_features();
5818 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5819 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5820 session->con->send_message(m);
5821 }
5822}
5823
5824
7c673cae
FG
/*
 * Cleanly tear down the mount: drain in-flight MDS requests, destroy any
 * files/dirs left open, flush dirty data and caps, empty the inode cache,
 * and close all MDS sessions.  Blocks until everything is released.  If we
 * are blacklisted, skip the flush path and just purge cached data.
 */
void Client::unmount()
{
  Mutex::Locker lock(client_lock);

  assert(mounted);  // caller is confused?

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for all outstanding MetaRequests to complete
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ...and any unclosed low-level (ll_) handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ...and any open directories
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for any unacknowledged sync writes to be committed
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // NOTE(review): iteration keeps an explicit 'next' — presumably because
    // _release/_flush can drop the current entry from inode_map; confirm.
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  // push dirty caps to the MDS and wait for the flushes to be acked
  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  lru.lru_set_max(0);
  trim_cache();

  // anything still cached is waiting on cap releases; poll with a timeout
  // so we can dump the cache for debugging if we appear stuck
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5947
5948
5949
5950class C_C_Tick : public Context {
5951 Client *client;
5952public:
5953 explicit C_C_Tick(Client *c) : client(c) {}
5954 void finish(int r) override {
5955 // Called back via Timer, which takes client_lock for us
5956 assert(client->client_lock.is_locked_by_me());
5957 client->tick();
5958 }
5959};
5960
5961void Client::flush_cap_releases()
5962{
5963 // send any cap releases
5964 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5965 p != mds_sessions.end();
5966 ++p) {
5967 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5968 p->first)) {
5969 if (cct->_conf->client_inject_release_failure) {
5970 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5971 p->second->release->put();
5972 } else {
5973 p->second->con->send_message(p->second->release);
5974 }
5975 p->second->release = 0;
5976 }
5977 }
5978}
5979
/*
 * Periodic housekeeping, re-armed every client_tick_interval seconds:
 * abort stalled mount-time requests, renew MDS caps, send queued cap
 * releases, kick delayed cap checks, and trim the cache.
 */
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: stall this tick once, then clear the injected delay
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm ourselves for the next interval
  tick_event = new C_C_Tick(this);
  timer.add_event_after(cct->_conf->client_tick_interval, tick_event);

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // not mounted yet: abort the oldest request if it has outlived the
    // mount timeout, and wake anyone waiting on the mdsmap / session opens
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: re-check inodes whose hold has expired; stop at the
  // first entry not yet due
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6033
6034void Client::renew_caps()
6035{
6036 ldout(cct, 10) << "renew_caps()" << dendl;
6037 last_cap_renew = ceph_clock_now();
6038
6039 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6040 p != mds_sessions.end();
6041 ++p) {
6042 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6043 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6044 renew_caps(p->second);
6045 }
6046}
6047
6048void Client::renew_caps(MetaSession *session)
6049{
6050 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6051 session->last_cap_renew_request = ceph_clock_now();
6052 uint64_t seq = ++session->cap_renew_seq;
6053 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6054}
6055
6056
6057// ===============================================================
6058// high level (POSIXy) interface
6059
6060int Client::_do_lookup(Inode *dir, const string& name, int mask,
6061 InodeRef *target, const UserPerm& perms)
6062{
6063 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6064 MetaRequest *req = new MetaRequest(op);
6065 filepath path;
6066 dir->make_nosnap_relative_path(path);
6067 path.push_dentry(name);
6068 req->set_filepath(path);
6069 req->set_inode(dir);
6070 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6071 mask |= DEBUG_GETATTR_CAPS;
6072 req->head.args.getattr.mask = mask;
6073
6074 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6075
6076 int r = make_request(req, perms, target);
6077 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6078 return r;
6079}
6080
/*
 * Look up @dname in @dir, preferring cached dentries whose dentry lease
 * (or the directory's FILE_SHARED cap) is still valid; otherwise fall
 * through to a synchronous MDS lookup via _do_lookup().  @mask is the set
 * of caps the caller needs issued on the result for a cached hit to be
 * usable.  Fills *target on success; returns 0 or a negative errno.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;  // no parent linkage: ".." of our root is itself
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // magic snapshot directory (".snap" by default)
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only trustworthy while the issuing session's caps
	// are live and its generation matches the lease's
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (FILE_SHARED cap on the directory covers its dentries)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask)))
	  goto hit_dn;
	// a cached null dentry in a complete dir proves non-existence
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss (or stale): go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;  // valid cached *null* dentry
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6182
/*
 * Return (in *pdn) the dentry for @name in @dir, linking a new null
 * dentry if none is cached.  When @expect_null is set, fail with -EEXIST
 * if a cached dentry with a still-valid lease already points at an inode.
 */
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	// lease proves the name exists; honor expect_null
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6214
/*
 * Walk @origpath component by component, starting from root (absolute
 * path) or cwd (relative), resolving symlinks along the way.  A trailing
 * symlink is only followed when @followsym is true; symlinks in the
 * middle of the path are always followed.  @mask requests extra caps on
 * the final component's lookup.  On success stores the final inode in
 * *end (if non-null) and returns 0; otherwise a negative errno.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;  // symlinks resolved so far, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // enforce search permission on every traversed directory
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;  // absolute target: restart the walk from the root
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6296
6297
6298// namespace ops
6299
/*
 * Create a hard link at @relpath pointing to the inode at @relexisting.
 * Returns 0 on success or a negative errno (-EEXIST for "/", -EPERM for
 * directories or failed permission checks).
 */
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  // a link can never be created at the root itself
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    // hard links to directories are not allowed
    if (S_ISDIR(in->mode)) {
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
6339
6340int Client::unlink(const char *relpath, const UserPerm& perm)
6341{
6342 Mutex::Locker lock(client_lock);
6343 tout(cct) << "unlink" << std::endl;
6344 tout(cct) << relpath << std::endl;
6345
6346 if (std::string(relpath) == "/")
6347 return -EISDIR;
6348
6349 filepath path(relpath);
6350 string name = path.last_dentry();
6351 path.pop_dentry();
6352 InodeRef dir;
6353 int r = path_walk(path, &dir, perm);
6354 if (r < 0)
6355 return r;
6356 if (cct->_conf->client_permissions) {
6357 r = may_delete(dir.get(), name.c_str(), perm);
6358 if (r < 0)
6359 return r;
6360 }
6361 return _unlink(dir.get(), name.c_str(), perm);
6362}
6363
6364int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6365{
6366 Mutex::Locker lock(client_lock);
6367 tout(cct) << "rename" << std::endl;
6368 tout(cct) << relfrom << std::endl;
6369 tout(cct) << relto << std::endl;
6370
6371 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6372 return -EBUSY;
6373
6374 filepath from(relfrom);
6375 filepath to(relto);
6376 string fromname = from.last_dentry();
6377 from.pop_dentry();
6378 string toname = to.last_dentry();
6379 to.pop_dentry();
6380
6381 InodeRef fromdir, todir;
6382 int r = path_walk(from, &fromdir, perm);
6383 if (r < 0)
6384 goto out;
6385 r = path_walk(to, &todir, perm);
6386 if (r < 0)
6387 goto out;
6388
6389 if (cct->_conf->client_permissions) {
6390 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6391 if (r < 0)
6392 return r;
6393 r = may_delete(todir.get(), toname.c_str(), perm);
6394 if (r < 0 && r != -ENOENT)
6395 return r;
6396 }
6397 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6398out:
6399 return r;
6400}
6401
6402// dirs
6403
6404int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6405{
6406 Mutex::Locker lock(client_lock);
6407 tout(cct) << "mkdir" << std::endl;
6408 tout(cct) << relpath << std::endl;
6409 tout(cct) << mode << std::endl;
6410 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6411
6412 if (std::string(relpath) == "/")
6413 return -EEXIST;
6414
6415 filepath path(relpath);
6416 string name = path.last_dentry();
6417 path.pop_dentry();
6418 InodeRef dir;
6419 int r = path_walk(path, &dir, perm);
6420 if (r < 0)
6421 return r;
6422 if (cct->_conf->client_permissions) {
6423 r = may_create(dir.get(), perm);
6424 if (r < 0)
6425 return r;
6426 }
6427 return _mkdir(dir.get(), name.c_str(), mode, perm);
6428}
6429
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  // Create every missing directory along relpath, like `mkdir -p`.
  // Phase 1 walks the existing prefix; phase 2 creates the rest.
  // Returns -EEXIST if the whole path already exists, otherwise the
  // first error hit while walking or creating.
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;  // walk is relative to the current working directory
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // only a missing component (-ENOENT) may be created; any other
  // walk failure is propagated as-is
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
    //check proper creation/existence
    if (r < 0) return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6478
6479int Client::rmdir(const char *relpath, const UserPerm& perms)
6480{
6481 Mutex::Locker lock(client_lock);
6482 tout(cct) << "rmdir" << std::endl;
6483 tout(cct) << relpath << std::endl;
6484
6485 if (std::string(relpath) == "/")
6486 return -EBUSY;
6487
6488 filepath path(relpath);
6489 string name = path.last_dentry();
6490 path.pop_dentry();
6491 InodeRef dir;
6492 int r = path_walk(path, &dir, perms);
6493 if (r < 0)
6494 return r;
6495 if (cct->_conf->client_permissions) {
6496 int r = may_delete(dir.get(), name.c_str(), perms);
6497 if (r < 0)
6498 return r;
6499 }
6500 return _rmdir(dir.get(), name.c_str(), perms);
6501}
6502
6503int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6504{
6505 Mutex::Locker lock(client_lock);
6506 tout(cct) << "mknod" << std::endl;
6507 tout(cct) << relpath << std::endl;
6508 tout(cct) << mode << std::endl;
6509 tout(cct) << rdev << std::endl;
6510
6511 if (std::string(relpath) == "/")
6512 return -EEXIST;
6513
6514 filepath path(relpath);
6515 string name = path.last_dentry();
6516 path.pop_dentry();
6517 InodeRef dir;
6518 int r = path_walk(path, &dir, perms);
6519 if (r < 0)
6520 return r;
6521 if (cct->_conf->client_permissions) {
6522 int r = may_create(dir.get(), perms);
6523 if (r < 0)
6524 return r;
6525 }
6526 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6527}
6528
6529// symlinks
6530
6531int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6532{
6533 Mutex::Locker lock(client_lock);
6534 tout(cct) << "symlink" << std::endl;
6535 tout(cct) << target << std::endl;
6536 tout(cct) << relpath << std::endl;
6537
6538 if (std::string(relpath) == "/")
6539 return -EEXIST;
6540
6541 filepath path(relpath);
6542 string name = path.last_dentry();
6543 path.pop_dentry();
6544 InodeRef dir;
6545 int r = path_walk(path, &dir, perms);
6546 if (r < 0)
6547 return r;
6548 if (cct->_conf->client_permissions) {
6549 int r = may_create(dir.get(), perms);
6550 if (r < 0)
6551 return r;
6552 }
6553 return _symlink(dir.get(), name.c_str(), target, perms);
6554}
6555
6556int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6557{
6558 Mutex::Locker lock(client_lock);
6559 tout(cct) << "readlink" << std::endl;
6560 tout(cct) << relpath << std::endl;
6561
6562 filepath path(relpath);
6563 InodeRef in;
6564 int r = path_walk(path, &in, perms, false);
6565 if (r < 0)
6566 return r;
6567
6568 return _readlink(in.get(), buf, size);
6569}
6570
6571int Client::_readlink(Inode *in, char *buf, size_t size)
6572{
6573 if (!in->is_symlink())
6574 return -EINVAL;
6575
6576 // copy into buf (at most size bytes)
6577 int r = in->symlink.length();
6578 if (r > (int)size)
6579 r = size;
6580 memcpy(buf, in->symlink.c_str(), r);
6581 return r;
6582}
6583
6584
6585// inode stuff
6586
6587int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6588{
6589 bool yes = in->caps_issued_mask(mask);
6590
6591 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6592 if (yes && !force)
6593 return 0;
6594
6595 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6596 filepath path;
6597 in->make_nosnap_relative_path(path);
6598 req->set_filepath(path);
6599 req->set_inode(in);
6600 req->head.args.getattr.mask = mask;
6601
6602 int res = make_request(req, perms);
6603 ldout(cct, 10) << "_getattr result=" << res << dendl;
6604 return res;
6605}
6606
// Core setattr implementation.  Applies as many of the requested
// attribute changes as possible locally (when this client holds the
// relevant exclusive caps) and sends the remainder to the MDS as a
// CEPH_MDS_OP_SETATTR request.  `mask` uses CEPH_SETATTR_* bits; bits
// handled locally are cleared from it as we go.
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must fit within the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    // a different user already has dirty caps on this inode; punt the
    // whole operation to the MDS rather than mixing dirtier identities
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; if we hold none, fall
    // through to a synchronous MDS request below
    if (issued & CEPH_CAP_AUTH_EXCL)
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with exclusive auth caps, ownership/mode changes can be applied
    // locally.  Truncation or an explicit request kills setuid/setgid.
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; the file-type bits are kept
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode)) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~S_ISUID;
      // setgid is only cleared when group-execute is also set
      if ((in->mode & (S_ISGID|S_IXGRP)) == (S_ISGID|S_IXGRP))
	in->mode &= ~S_ISGID;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with exclusive file caps, timestamps can be updated locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    // everything was handled locally
    in->change_attr++;
    return 0;
  }

force_request:
  // send whatever remains in `mask` to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // for each attribute sent, drop the caps whose cached state the MDS
  // update would invalidate
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6799
/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  // Copy only the fields _do_setattr consults; other *stx fields are
  // left untouched -- callers indicate validity via the setattr mask.
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
}
6810
6811int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6812 const UserPerm& perms, InodeRef *inp)
6813{
6814 int ret = _do_setattr(in, stx, mask, perms, inp);
6815 if (ret < 0)
6816 return ret;
6817 if (mask & CEPH_SETATTR_MODE)
6818 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6819 return ret;
6820}
6821
6822int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6823 const UserPerm& perms)
6824{
6825 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6826 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6827 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6828 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6829 if (cct->_conf->client_permissions) {
6830 int r = may_setattr(in.get(), stx, mask, perms);
6831 if (r < 0)
6832 return r;
6833 }
6834 return __setattrx(in.get(), stx, mask, perms);
6835}
6836
6837int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6838 const UserPerm& perms)
6839{
6840 struct ceph_statx stx;
6841
6842 stat_to_statx(attr, &stx);
6843 mask &= ~CEPH_SETATTR_BTIME;
6844 return _setattrx(in, &stx, mask, perms);
6845}
6846
6847int Client::setattr(const char *relpath, struct stat *attr, int mask,
6848 const UserPerm& perms)
6849{
6850 Mutex::Locker lock(client_lock);
6851 tout(cct) << "setattr" << std::endl;
6852 tout(cct) << relpath << std::endl;
6853 tout(cct) << mask << std::endl;
6854
6855 filepath path(relpath);
6856 InodeRef in;
6857 int r = path_walk(path, &in, perms);
6858 if (r < 0)
6859 return r;
6860 return _setattr(in, attr, mask, perms);
6861}
6862
6863int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6864 const UserPerm& perms, int flags)
6865{
6866 Mutex::Locker lock(client_lock);
6867 tout(cct) << "setattrx" << std::endl;
6868 tout(cct) << relpath << std::endl;
6869 tout(cct) << mask << std::endl;
6870
6871 filepath path(relpath);
6872 InodeRef in;
6873 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6874 if (r < 0)
6875 return r;
6876 return _setattrx(in, stx, mask, perms);
6877}
6878
6879int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6880{
6881 Mutex::Locker lock(client_lock);
6882 tout(cct) << "fsetattr" << std::endl;
6883 tout(cct) << fd << std::endl;
6884 tout(cct) << mask << std::endl;
6885
6886 Fh *f = get_filehandle(fd);
6887 if (!f)
6888 return -EBADF;
6889#if defined(__linux__) && defined(O_PATH)
6890 if (f->flags & O_PATH)
6891 return -EBADF;
6892#endif
6893 return _setattr(f->inode, attr, mask, perms);
6894}
6895
6896int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6897{
6898 Mutex::Locker lock(client_lock);
6899 tout(cct) << "fsetattr" << std::endl;
6900 tout(cct) << fd << std::endl;
6901 tout(cct) << mask << std::endl;
6902
6903 Fh *f = get_filehandle(fd);
6904 if (!f)
6905 return -EBADF;
6906#if defined(__linux__) && defined(O_PATH)
6907 if (f->flags & O_PATH)
6908 return -EBADF;
6909#endif
6910 return _setattrx(f->inode, stx, mask, perms);
6911}
6912
6913int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6914 frag_info_t *dirstat, int mask)
6915{
6916 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6917 Mutex::Locker lock(client_lock);
6918 tout(cct) << "stat" << std::endl;
6919 tout(cct) << relpath << std::endl;
6920 filepath path(relpath);
6921 InodeRef in;
6922 int r = path_walk(path, &in, perms, true, mask);
6923 if (r < 0)
6924 return r;
6925 r = _getattr(in, mask, perms);
6926 if (r < 0) {
6927 ldout(cct, 3) << "stat exit on error!" << dendl;
6928 return r;
6929 }
6930 fill_stat(in, stbuf, dirstat);
6931 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6932 return r;
6933}
6934
6935unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6936{
6937 unsigned mask = 0;
6938
6939 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6940 if (flags & AT_NO_ATTR_SYNC)
6941 goto out;
6942
6943 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6944 mask |= CEPH_CAP_PIN;
6945 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6946 mask |= CEPH_CAP_AUTH_SHARED;
6947 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6948 mask |= CEPH_CAP_LINK_SHARED;
6949 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
6950 mask |= CEPH_CAP_FILE_SHARED;
6951 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
6952 mask |= CEPH_CAP_XATTR_SHARED;
6953out:
6954 return mask;
6955}
6956
6957int Client::statx(const char *relpath, struct ceph_statx *stx,
6958 const UserPerm& perms,
6959 unsigned int want, unsigned int flags)
6960{
6961 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
6962 Mutex::Locker lock(client_lock);
6963 tout(cct) << "statx" << std::endl;
6964 tout(cct) << relpath << std::endl;
6965 filepath path(relpath);
6966 InodeRef in;
6967
6968 unsigned mask = statx_to_mask(flags, want);
6969
6970 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
6971 if (r < 0)
6972 return r;
6973
6974 r = _getattr(in, mask, perms);
6975 if (r < 0) {
6976 ldout(cct, 3) << "statx exit on error!" << dendl;
6977 return r;
6978 }
6979
6980 fill_statx(in, mask, stx);
6981 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
6982 return r;
6983}
6984
6985int Client::lstat(const char *relpath, struct stat *stbuf,
6986 const UserPerm& perms, frag_info_t *dirstat, int mask)
6987{
6988 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6989 Mutex::Locker lock(client_lock);
6990 tout(cct) << "lstat" << std::endl;
6991 tout(cct) << relpath << std::endl;
6992 filepath path(relpath);
6993 InodeRef in;
6994 // don't follow symlinks
6995 int r = path_walk(path, &in, perms, false, mask);
6996 if (r < 0)
6997 return r;
6998 r = _getattr(in, mask, perms);
6999 if (r < 0) {
7000 ldout(cct, 3) << "lstat exit on error!" << dendl;
7001 return r;
7002 }
7003 fill_stat(in, stbuf, dirstat);
7004 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7005 return r;
7006}
7007
// Translate cached inode state into a struct stat.  Optionally copies
// the directory frag/nest stats out.  Returns the caps currently
// issued for the inode so callers can judge freshness.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;  // snapid is reported as the device number
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  st->st_nlink = in->nlink;
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report max(ctime, mtime) as ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7054
// Translate cached inode state into a ceph_statx.  `mask` is the set
// of cap bits actually covered (from statx_to_mask); only fields whose
// caps are covered get filled and flagged in stx->stx_mask.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership, full mode and birth time need Auth caps
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count needs Link caps
  if (mask & CEPH_CAP_LINK_SHARED) {
    stx->stx_nlink = in->nlink;
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // timestamps and size need File caps
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // ctime is reported as max(ctime, mtime)
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7122
// Mark the dentry as recently used in the LRU so it is trimmed last.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7127
7128int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7129{
7130 Mutex::Locker lock(client_lock);
7131 tout(cct) << "chmod" << std::endl;
7132 tout(cct) << relpath << std::endl;
7133 tout(cct) << mode << std::endl;
7134 filepath path(relpath);
7135 InodeRef in;
7136 int r = path_walk(path, &in, perms);
7137 if (r < 0)
7138 return r;
7139 struct stat attr;
7140 attr.st_mode = mode;
7141 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7142}
7143
7144int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7145{
7146 Mutex::Locker lock(client_lock);
7147 tout(cct) << "fchmod" << std::endl;
7148 tout(cct) << fd << std::endl;
7149 tout(cct) << mode << std::endl;
7150 Fh *f = get_filehandle(fd);
7151 if (!f)
7152 return -EBADF;
7153#if defined(__linux__) && defined(O_PATH)
7154 if (f->flags & O_PATH)
7155 return -EBADF;
7156#endif
7157 struct stat attr;
7158 attr.st_mode = mode;
7159 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7160}
7161
7162int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7163{
7164 Mutex::Locker lock(client_lock);
7165 tout(cct) << "lchmod" << std::endl;
7166 tout(cct) << relpath << std::endl;
7167 tout(cct) << mode << std::endl;
7168 filepath path(relpath);
7169 InodeRef in;
7170 // don't follow symlinks
7171 int r = path_walk(path, &in, perms, false);
7172 if (r < 0)
7173 return r;
7174 struct stat attr;
7175 attr.st_mode = mode;
7176 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7177}
7178
7179int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7180 const UserPerm& perms)
7181{
7182 Mutex::Locker lock(client_lock);
7183 tout(cct) << "chown" << std::endl;
7184 tout(cct) << relpath << std::endl;
7185 tout(cct) << new_uid << std::endl;
7186 tout(cct) << new_gid << std::endl;
7187 filepath path(relpath);
7188 InodeRef in;
7189 int r = path_walk(path, &in, perms);
7190 if (r < 0)
7191 return r;
7192 struct stat attr;
7193 attr.st_uid = new_uid;
7194 attr.st_gid = new_gid;
7195 int mask = 0;
7196 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7197 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7198 return _setattr(in, &attr, mask, perms);
7199}
7200
7201int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7202{
7203 Mutex::Locker lock(client_lock);
7204 tout(cct) << "fchown" << std::endl;
7205 tout(cct) << fd << std::endl;
7206 tout(cct) << new_uid << std::endl;
7207 tout(cct) << new_gid << std::endl;
7208 Fh *f = get_filehandle(fd);
7209 if (!f)
7210 return -EBADF;
7211#if defined(__linux__) && defined(O_PATH)
7212 if (f->flags & O_PATH)
7213 return -EBADF;
7214#endif
7215 struct stat attr;
7216 attr.st_uid = new_uid;
7217 attr.st_gid = new_gid;
7218 int mask = 0;
7219 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7220 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7221 return _setattr(f->inode, &attr, mask, perms);
7222}
7223
7224int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7225 const UserPerm& perms)
7226{
7227 Mutex::Locker lock(client_lock);
7228 tout(cct) << "lchown" << std::endl;
7229 tout(cct) << relpath << std::endl;
7230 tout(cct) << new_uid << std::endl;
7231 tout(cct) << new_gid << std::endl;
7232 filepath path(relpath);
7233 InodeRef in;
7234 // don't follow symlinks
7235 int r = path_walk(path, &in, perms, false);
7236 if (r < 0)
7237 return r;
7238 struct stat attr;
7239 attr.st_uid = new_uid;
7240 attr.st_gid = new_gid;
7241 int mask = 0;
7242 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7243 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7244 return _setattr(in, &attr, mask, perms);
7245}
7246
7247int Client::utime(const char *relpath, struct utimbuf *buf,
7248 const UserPerm& perms)
7249{
7250 Mutex::Locker lock(client_lock);
7251 tout(cct) << "utime" << std::endl;
7252 tout(cct) << relpath << std::endl;
7253 tout(cct) << buf->modtime << std::endl;
7254 tout(cct) << buf->actime << std::endl;
7255 filepath path(relpath);
7256 InodeRef in;
7257 int r = path_walk(path, &in, perms);
7258 if (r < 0)
7259 return r;
7260 struct stat attr;
7261 stat_set_mtime_sec(&attr, buf->modtime);
7262 stat_set_mtime_nsec(&attr, 0);
7263 stat_set_atime_sec(&attr, buf->actime);
7264 stat_set_atime_nsec(&attr, 0);
7265 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7266}
7267
7268int Client::lutime(const char *relpath, struct utimbuf *buf,
7269 const UserPerm& perms)
7270{
7271 Mutex::Locker lock(client_lock);
7272 tout(cct) << "lutime" << std::endl;
7273 tout(cct) << relpath << std::endl;
7274 tout(cct) << buf->modtime << std::endl;
7275 tout(cct) << buf->actime << std::endl;
7276 filepath path(relpath);
7277 InodeRef in;
7278 // don't follow symlinks
7279 int r = path_walk(path, &in, perms, false);
7280 if (r < 0)
7281 return r;
7282 struct stat attr;
7283 stat_set_mtime_sec(&attr, buf->modtime);
7284 stat_set_mtime_nsec(&attr, 0);
7285 stat_set_atime_sec(&attr, buf->actime);
7286 stat_set_atime_nsec(&attr, 0);
7287 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7288}
7289
7290int Client::flock(int fd, int operation, uint64_t owner)
7291{
7292 Mutex::Locker lock(client_lock);
7293 tout(cct) << "flock" << std::endl;
7294 tout(cct) << fd << std::endl;
7295 tout(cct) << operation << std::endl;
7296 tout(cct) << owner << std::endl;
7297 Fh *f = get_filehandle(fd);
7298 if (!f)
7299 return -EBADF;
7300
7301 return _flock(f, operation, owner);
7302}
7303
7304int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7305{
7306 Mutex::Locker lock(client_lock);
7307 tout(cct) << "opendir" << std::endl;
7308 tout(cct) << relpath << std::endl;
7309 filepath path(relpath);
7310 InodeRef in;
7311 int r = path_walk(path, &in, perms, true);
7312 if (r < 0)
7313 return r;
7314 if (cct->_conf->client_permissions) {
7315 int r = may_open(in.get(), O_RDONLY, perms);
7316 if (r < 0)
7317 return r;
7318 }
7319 r = _opendir(in.get(), dirpp, perms);
7320 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7321 if (r != -ENOTDIR)
7322 tout(cct) << (unsigned long)*dirpp << std::endl;
7323 return r;
7324}
7325
7326int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7327{
7328 if (!in->is_dir())
7329 return -ENOTDIR;
7330 *dirpp = new dir_result_t(in, perms);
7331 opened_dirs.insert(*dirpp);
7332 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7333 return 0;
7334}
7335
7336
// Public wrapper: tear down a dir_result_t created by opendir().
// Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7347
// Release everything a dir_result_t holds: its inode ref, any buffered
// readdir entries, and its slot in opened_dirs; then free it.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();  // drop our reference to the directory inode
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7359
// Reset a directory cursor to the beginning, discarding any buffered
// entries so the next readdir refetches from scratch.
void Client::rewinddir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7369
// Report the current readdir position (an opaque fpos encoding, not a
// byte offset).
// NOTE(review): unlike the other dir ops this takes no client_lock; it
// only reads a single field — presumably considered safe, confirm.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7376
// Reposition a directory cursor to `offset` (a value previously obtained
// from telldir()).  Drops the buffered fragment when the new position
// cannot be served from it.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (offset == dirp->offset)
    return;

  // a seek invalidates this traversal for cache-priming purposes:
  // forward seek - we skipped entries, so don't mark the dir COMPLETE;
  // backward seek - stop filling the readdir cache in order
  if (offset > dirp->offset)
    dirp->release_count = 0; // bump if we do a forward seek
  else
    dirp->ordered_count = 0; // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-order offsets are monotonic; only a backward seek forces a
    // refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-order: refetch when seeking to the start, into a different
    // fragment, or backwards within the buffered fragment
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7407
7408
7409//struct dirent {
7410// ino_t d_ino; /* inode number */
7411// off_t d_off; /* offset to the next dirent */
7412// unsigned short d_reclen; /* length of this record */
7413// unsigned char d_type; /* type of file */
7414// char d_name[256]; /* filename */
7415//};
// Populate a struct dirent from a name/type/ino triple.
// `next_off` is the fpos of the *following* entry (what seekdir would
// need to resume after this one).
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // copy at most 255 chars and always NUL-terminate
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  // d_reclen is not a real record length here; callers use sizeof(*de)
  de->d_reclen = 1;
  de->d_type = IFTODT(type);  // S_IF* mode bits -> DT_* dirent type
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7431
// Advance the cursor past the fragment we just finished reading, or mark
// the whole directory done if that was the rightmost fragment.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    // no successor fragment: the directory listing is complete
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // hash-order fpos encodes (frag, hash); never move the offset backwards
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag-order: start at slot 2 of the new fragment (0/1 are . and ..)
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7457
// Re-map the fragment encoded in the current offset onto the (possibly
// changed) dirfragtree; if the tree was split/merged under us, restart at
// the beginning of the fragment that now covers our position.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // hash-order positions are independent of the fragment layout
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7474
// Discard the buffered dentries for the current fragment; the next
// readdir call will refetch them (from the MDS or the dentry cache).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7480
7481int Client::_readdir_get_frag(dir_result_t *dirp)
7482{
7483 assert(dirp);
7484 assert(dirp->inode);
7485
7486 // get the current frag.
7487 frag_t fg;
7488 if (dirp->hash_order())
7489 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7490 else
7491 fg = frag_t(dirp->offset_high());
7492
7493 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7494 << " offset " << hex << dirp->offset << dec << dendl;
7495
7496 int op = CEPH_MDS_OP_READDIR;
7497 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7498 op = CEPH_MDS_OP_LSSNAP;
7499
7500 InodeRef& diri = dirp->inode;
7501
7502 MetaRequest *req = new MetaRequest(op);
7503 filepath path;
7504 diri->make_nosnap_relative_path(path);
7505 req->set_filepath(path);
7506 req->set_inode(diri.get());
7507 req->head.args.readdir.frag = fg;
7508 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7509 if (dirp->last_name.length()) {
7510 req->path2.set_path(dirp->last_name.c_str());
7511 } else if (dirp->hash_order()) {
7512 req->head.args.readdir.offset_hash = dirp->offset_high();
7513 }
7514 req->dirp = dirp;
7515
7516 bufferlist dirbl;
7517 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7518
7519 if (res == -EAGAIN) {
7520 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7521 _readdir_rechoose_frag(dirp);
7522 return _readdir_get_frag(dirp);
7523 }
7524
7525 if (res == 0) {
7526 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7527 << " size " << dirp->buffer.size() << dendl;
7528 } else {
7529 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7530 dirp->set_end();
7531 }
7532
7533 return res;
7534}
7535
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by their fragment-aware readdir position.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7541
// Serve a readdir pass from the local dentry cache.  Valid only while the
// directory inode remains complete+ordered; returns -EAGAIN as soon as
// that stops holding so the caller can fall back to fetching from the MDS.
// `cb` is invoked per entry with client_lock dropped; a negative return
// aborts, a positive return stops after the current entry.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at or after the current offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // _getattr()/cb below may drop client_lock, so re-validate the cache
    // every iteration
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative dentry — nothing to report
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry is from an older cap generation; not trustworthy
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      // hand the caller a referenced Inode* along with the entry
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock while running the user's callback.
    // NOTE(review): `dn` (and `pd`) are touched again after relocking;
    // this appears to rely on the is_complete_and_ordered() re-check to
    // bail before they could be invalidated — confirm.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
	     << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // commit our position past the entry we just delivered
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7624
// Core readdir driver: delivers directory entries to `cb` one at a time.
// Synthesizes "." and ".." at offsets 0 and 1, then serves entries from
// the local cache when possible, otherwise fetches fragments from the MDS.
// cb's return: <0 abort with that error, >0 stop after this entry, 0 keep
// going.  `want`/`flags` select which statx fields must be valid; `getref`
// makes each callback receive a referenced Inode*.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // user callback runs without client_lock
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." (the dir itself when it has no parent)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->inode;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN means the cache became untrustworthy mid-pass; fall through
    // to the MDS path
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver every buffered entry at or after the current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // the fragment was delivered in chunks; fetch the next chunk
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // we walked the whole directory in one traversal: if nothing changed
    // underneath us, we may mark the inode complete (and ordered)
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7814
7815
// POSIX-style readdir_r: fill *de with the next entry.
// Returns 1 if an entry was produced, 0 at end of directory, <0 on error
// (see readdirplus_r).
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
7820
7821/*
7822 * readdirplus_r
7823 *
7824 * returns
7825 * 1 if we got a dirent
7826 * 0 for end of directory
7827 * <0 on error
7828 */
7829
// Context for _readdir_single_dirent_cb: receives exactly one directory
// entry from readdir_r_cb.
struct single_readdir {
  struct dirent *de;       // caller-provided dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // inode of the returned entry (when getref)
  bool full;               // set once the single slot has been consumed
};
7836
7837static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7838 struct ceph_statx *stx, off_t off,
7839 Inode *in)
7840{
7841 single_readdir *c = static_cast<single_readdir *>(p);
7842
7843 if (c->full)
7844 return -1; // already filled this dirent
7845
7846 *c->de = *de;
7847 if (c->stx)
7848 *c->stx = *stx;
7849 c->inode = in;
7850 c->full = true;
7851 return 1;
7852}
7853
7854struct dirent *Client::readdir(dir_result_t *d)
7855{
7856 int ret;
7857 static struct dirent de;
7858 single_readdir sr;
7859 sr.de = &de;
7860 sr.stx = NULL;
7861 sr.inode = NULL;
7862 sr.full = false;
7863
7864 // our callback fills the dirent and sets sr.full=true on first
7865 // call, and returns -1 the second time around.
7866 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
7867 if (ret < -1) {
7868 errno = -ret; // this sucks.
7869 return (dirent *) NULL;
7870 }
7871 if (sr.full) {
7872 return &de;
7873 }
7874 return (dirent *) NULL;
7875}
7876
7877int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
7878 struct ceph_statx *stx, unsigned want,
7879 unsigned flags, Inode **out)
7880{
7881 single_readdir sr;
7882 sr.de = de;
7883 sr.stx = stx;
7884 sr.inode = NULL;
7885 sr.full = false;
7886
7887 // our callback fills the dirent and sets sr.full=true on first
7888 // call, and returns -1 the second time around.
7889 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
7890 if (r < -1)
7891 return r;
7892 if (out)
7893 *out = sr.inode;
7894 if (sr.full)
7895 return 1;
7896 return 0;
7897}
7898
7899
7900/* getdents */
/* getdents */
// Accumulator for _readdir_getdent_cb: packs entries into a flat buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: just NUL-terminated names
};
7907
7908static int _readdir_getdent_cb(void *p, struct dirent *de,
7909 struct ceph_statx *stx, off_t off, Inode *in)
7910{
7911 struct getdents_result *c = static_cast<getdents_result *>(p);
7912
7913 int dlen;
7914 if (c->fullent)
7915 dlen = sizeof(*de);
7916 else
7917 dlen = strlen(de->d_name) + 1;
7918
7919 if (c->pos + dlen > c->buflen)
7920 return -1; // doesn't fit
7921
7922 if (c->fullent) {
7923 memcpy(c->buf + c->pos, de, sizeof(*de));
7924 } else {
7925 memcpy(c->buf + c->pos, de->d_name, dlen);
7926 }
7927 c->pos += dlen;
7928 return 0;
7929}
7930
7931int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
7932{
7933 getdents_result gr;
7934 gr.buf = buf;
7935 gr.buflen = buflen;
7936 gr.fullent = fullent;
7937 gr.pos = 0;
7938
7939 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
7940
7941 if (r < 0) { // some error
7942 if (r == -1) { // buffer ran out of space
7943 if (gr.pos) { // but we got some entries already!
7944 return gr.pos;
7945 } // or we need a larger buffer
7946 return -ERANGE;
7947 } else { // actual error, return it
7948 return r;
7949 }
7950 }
7951 return gr.pos;
7952}
7953
7954
7955/* getdir */
/* getdir */
// Accumulator for _getdir_cb: collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // names appended here
  int num;                 // number of entries seen
};
7960
7961static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
7962{
7963 getdir_result *r = static_cast<getdir_result *>(p);
7964
7965 r->contents->push_back(de->d_name);
7966 r->num++;
7967 return 0;
7968}
7969
7970int Client::getdir(const char *relpath, list<string>& contents,
7971 const UserPerm& perms)
7972{
7973 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
7974 {
7975 Mutex::Locker lock(client_lock);
7976 tout(cct) << "getdir" << std::endl;
7977 tout(cct) << relpath << std::endl;
7978 }
7979
7980 dir_result_t *d;
7981 int r = opendir(relpath, &d, perms);
7982 if (r < 0)
7983 return r;
7984
7985 getdir_result gr;
7986 gr.contents = &contents;
7987 gr.num = 0;
7988 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
7989
7990 closedir(d);
7991
7992 if (r < 0)
7993 return r;
7994 return gr.num;
7995}
7996
7997
7998/****** file i/o **********/
/****** file i/o **********/
// Open (and optionally create) a file by path, returning an integer fd.
// `stripe_unit`/`stripe_count`/`object_size`/`data_pool` apply only when a
// new file is created (0/NULL means defaults).
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file fails
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // path missing and O_CREAT: create the last component in its parent dir
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

  out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8080
// Convenience overload: open without custom layout parameters.
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8086
8087int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8088 const UserPerm& perms)
8089{
8090 Mutex::Locker lock(client_lock);
8091 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8092
8093 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8094 filepath path(ino);
8095 req->set_filepath(path);
8096
8097 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8098 char f[30];
8099 sprintf(f, "%u", h);
8100 filepath path2(dirino);
8101 path2.push_dentry(string(f));
8102 req->set_filepath2(path2);
8103
8104 int r = make_request(req, perms, NULL, NULL,
8105 rand() % mdsmap->get_num_in_mds());
8106 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8107 return r;
8108}
8109
8110
8111/**
8112 * Load inode into local cache.
8113 *
8114 * If inode pointer is non-NULL, and take a reference on
8115 * the resulting Inode object in one operation, so that caller
8116 * can safely assume inode will still be there after return.
8117 */
8118int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8119{
8120 Mutex::Locker lock(client_lock);
8121 ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
8122
8123 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8124 filepath path(ino);
8125 req->set_filepath(path);
8126
8127 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8128 if (r == 0 && inode != NULL) {
8129 vinodeno_t vino(ino, CEPH_NOSNAP);
8130 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8131 assert(p != inode_map.end());
8132 *inode = p->second;
8133 _ll_get(*inode);
8134 }
8135 ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
8136 return r;
8137}
8138
8139
8140
8141/**
8142 * Find the parent inode of `ino` and insert it into
8143 * our cache. Conditionally also set `parent` to a referenced
8144 * Inode* if caller provides non-NULL value.
8145 */
8146int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8147{
8148 Mutex::Locker lock(client_lock);
8149 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8150
8151 if (!ino->dn_set.empty()) {
8152 // if we exposed the parent here, we'd need to check permissions,
8153 // but right now we just rely on the MDS doing so in make_request
8154 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8155 return 0;
8156 }
8157
8158 if (ino->is_root()) {
8159 *parent = NULL;
8160 ldout(cct, 3) << "ino is root, no parent" << dendl;
8161 return -EINVAL;
8162 }
8163
8164 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8165 filepath path(ino->ino);
8166 req->set_filepath(path);
8167
8168 InodeRef target;
8169 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8170 // Give caller a reference to the parent ino if they provided a pointer.
8171 if (parent != NULL) {
8172 if (r == 0) {
8173 *parent = target.get();
8174 _ll_get(*parent);
8175 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8176 } else {
8177 *parent = NULL;
8178 }
8179 }
8180 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8181 return r;
8182}
8183
8184
8185/**
8186 * Populate the parent dentry for `ino`, provided it is
8187 * a child of `parent`.
8188 */
// Populate the parent dentry for `ino`, provided it is a child of
// `parent`: the LOOKUPNAME reply trace links the dentry into our cache.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any in-MDS can service this
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8205
8206
// Build a new file handle for an already-opened inode, configuring its
// readahead window from the client config and the file layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshots are immutable; track the open with a snap cap ref instead
    // of normal open refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	    << ccap_string(in->caps_issued()) << dendl;
  }

  // readahead limits: min from config; max is the smaller of the byte cap
  // and the period cap (both optional, 0 = unlimited)
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the striping period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8243
// Release a file handle: drop its open/cap references, flush dirty data
// if this was the last opener, release file locks, and surface any async
// write error recorded on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last opener in this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot handle: just drop the snap cap ref taken in _create_fh
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8276
// Drop one reference on a file handle; free it when the count hits zero.
void Client::_put_fh(Fh *f)
{
  int left = f->put();
  if (!left) {
    delete f;
  }
}
8284
// Open an already-resolved inode.  Takes an open ref for the cap mode,
// issues an MDS OPEN unless we already hold the needed caps, and on
// success optionally returns a new Fh via *fhp.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 &&
      in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT was handled by the caller (Client::open/_create)
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // failed open: undo the open ref we optimistically took above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8335
// Re-acquire file caps for an inode after they were lost (e.g. session
// reset).  If current caps suffice, just re-advertise wanted; otherwise
// replay an OPEN matching the wanted read/write mode.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted caps back into open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8373
8374int Client::close(int fd)
8375{
8376 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8377 Mutex::Locker lock(client_lock);
8378 tout(cct) << "close" << std::endl;
8379 tout(cct) << fd << std::endl;
8380
8381 Fh *fh = get_filehandle(fd);
8382 if (!fh)
8383 return -EBADF;
8384 int err = _release_fh(fh);
8385 fd_map.erase(fd);
8386 put_fd(fd);
8387 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8388 return err;
8389}
8390
8391
8392// ------------
8393// read, write
8394
8395loff_t Client::lseek(int fd, loff_t offset, int whence)
8396{
8397 Mutex::Locker lock(client_lock);
8398 tout(cct) << "lseek" << std::endl;
8399 tout(cct) << fd << std::endl;
8400 tout(cct) << offset << std::endl;
8401 tout(cct) << whence << std::endl;
8402
8403 Fh *f = get_filehandle(fd);
8404 if (!f)
8405 return -EBADF;
8406#if defined(__linux__) && defined(O_PATH)
8407 if (f->flags & O_PATH)
8408 return -EBADF;
8409#endif
8410 return _lseek(f, offset, whence);
8411}
8412
// Internal lseek on an Fh.  SEEK_END fetches the current size from the
// MDS first so the result reflects other writers.
// NOTE(review): SEEK_SET/SEEK_CUR do not validate that the resulting
// position is non-negative — confirm whether callers rely on that.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;

  switch (whence) {
  case SEEK_SET:
    f->pos = offset;
    break;

  case SEEK_CUR:
    f->pos += offset;
    break;

  case SEEK_END:
    // need an up-to-date size for end-relative seeks
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    f->pos = in->size + offset;
    break;

  default:
    // invalid whence is a caller bug
    ceph_abort();
  }

  ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
8441
8442
// Serialize access to an Fh's file position.  Waiters queue FIFO on
// pos_waiters and block until they reach the front with pos_locked clear.
// NOTE(review): nothing in this excerpt visibly signals these Conds when
// the lock is released — verify the wakeup path in unlock_fh_pos.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free AND we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8460
8461void Client::unlock_fh_pos(Fh *f)
8462{
8463 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8464 f->pos_locked = false;
8465}
8466
// Migrate inline file data out of the inode into the first RADOS object
// (<ino>.00000000).  Asynchronous: `onfinish` completes when the second
// (guarded) mutation finishes.  Returns 0 immediately.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  // nothing inline: complete right away
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // first op: make sure the object exists (non-exclusive create)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // second op: write the data, guarded by an inline_version xattr compare
  // so a racing uninline with a newer version wins
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8511
8512//
8513
8514// blocking osd interface
8515
8516int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8517{
8518 Mutex::Locker lock(client_lock);
8519 tout(cct) << "read" << std::endl;
8520 tout(cct) << fd << std::endl;
8521 tout(cct) << size << std::endl;
8522 tout(cct) << offset << std::endl;
8523
8524 Fh *f = get_filehandle(fd);
8525 if (!f)
8526 return -EBADF;
8527#if defined(__linux__) && defined(O_PATH)
8528 if (f->flags & O_PATH)
8529 return -EBADF;
8530#endif
8531 bufferlist bl;
8532 int r = _read(f, offset, size, &bl);
8533 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8534 if (r >= 0) {
8535 bl.copy(0, bl.length(), buf);
8536 r = bl.length();
8537 }
8538 return r;
8539}
8540
8541int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8542{
8543 if (iovcnt < 0)
8544 return -EINVAL;
8545 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8546}
8547
/**
 * Core read path: inline data, cached (async) reads, and sync OSD reads.
 *
 * @param f      open handle; must have been opened with read mode
 * @param offset absolute offset, or negative to use/advance f->pos
 *               (the handle position lock is taken in that case)
 * @param size   requested byte count
 * @param bl     output: bytes read are appended here
 * @return bytes read (bl->length()) on success, negative errno on error
 *
 * NOTE(review): when movepos is true, the early error returns below
 * (get_caps / _getattr failures) appear to return without calling
 * unlock_fh_pos(f), which would leave the handle position lock held --
 * verify against later upstream fixes.
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // negative offset means "read at the current position and advance it";
  // serialize with other implicit-offset I/O on this handle
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know the inline state; fetch it
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0)
    return r;

  // O_DIRECT bypasses the object cache even if we hold the CACHE cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the CACHE cap: push it to the OSD
      // and fall through to a normal read; completion is awaited in 'done'
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve directly from the in-memory inline payload,
      // zero-filling between the end of the payload and EOF
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached path; O_RSYNC requires dirty data be flushed first
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read near our cached EOF: drop caps, re-stat the size, and
      // retry if the file actually extends past what we got
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline write-out started above; client_lock must be
    // dropped while blocking on the side mutex
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else uninlined first; either way the
    // inline copy is now authoritatively on the OSD
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  return r < 0 ? r : bl->length();
}
8681
// Pins the Fh and records an in-flight readahead; both are released by
// the destructor, so the handle outlives the async read it tracks.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8687
// Balances the constructor: drop the pending-readahead count and the
// Fh reference taken when the readahead was issued.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8692
// Completion for an async readahead: releases the RD|CACHE cap refs that
// _read_async took when it kicked off the speculative read.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8697
/**
 * Read through the object cache, then opportunistically kick off
 * readahead for the window suggested by the per-handle Readahead state.
 *
 * @param f   open handle (caller holds the FILE_CACHE cap)
 * @param off absolute offset
 * @param len requested length; trimmed to the locally-known EOF
 * @param bl  output buffer
 * @return bytes read, 0 at/after EOF, or negative errno
 *
 * Caller must hold client_lock; it is dropped while waiting for an
 * uncached read to complete.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  // file_read returns 0 if the data was not fully cached and the read was
  // queued (onfinish will fire); nonzero means it was satisfied inline.
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // hold CACHE while blocked so the cap can't be revoked mid-read
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh; the matching cap refs taken below are
      // dropped in C_Readahead::finish when the speculative read lands
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8762
/**
 * Synchronous (uncached) read: issue Filer reads until the request is
 * satisfied or a short read suggests EOF.
 *
 * @param f        open handle
 * @param off      absolute offset
 * @param len      requested length
 * @param bl       output: data (plus zero-fill for holes below known EOF)
 * @param checkeof set to true when a short read may mean our cached size
 *                 is stale; caller (_read) re-stats and retries
 * @return bytes accumulated, or negative errno
 *
 * NOTE(review): 'left'/'read' are plain ints while len is uint64_t, so
 * requests over INT_MAX would truncate -- confirm callers cap the size.
 *
 * Caller must hold client_lock; it is dropped around each OSD wait.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// the object ended early but the file continues: this is a hole,
	// so zero up to known EOF (or the end of the request)
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // short read at/near our cached EOF -- the size may be stale;
      // let the caller re-stat and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
8829
8830
8831/*
8832 * we keep count of uncommitted sync writes on the inode, so that
8833 * fsync can DDRT.
8834 */
8835void Client::_sync_write_commit(Inode *in)
8836{
8837 assert(unsafe_sync_write > 0);
8838 unsafe_sync_write--;
8839
8840 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
8841
8842 ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
8843 if (unsafe_sync_write == 0 && unmounting) {
8844 ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
8845 mount_cond.Signal();
8846 }
8847}
8848
8849int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
8850{
8851 Mutex::Locker lock(client_lock);
8852 tout(cct) << "write" << std::endl;
8853 tout(cct) << fd << std::endl;
8854 tout(cct) << size << std::endl;
8855 tout(cct) << offset << std::endl;
8856
8857 Fh *fh = get_filehandle(fd);
8858 if (!fh)
8859 return -EBADF;
8860#if defined(__linux__) && defined(O_PATH)
8861 if (fh->flags & O_PATH)
8862 return -EBADF;
8863#endif
8864 int r = _write(fh, offset, size, buf, NULL, 0);
8865 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
8866 return r;
8867}
8868
8869int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
8870{
8871 if (iovcnt < 0)
8872 return -EINVAL;
8873 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
8874}
8875
/**
 * Shared implementation of preadv()/pwritev().
 *
 * For writes, the iovecs are handed straight to _write (which gathers
 * them).  For reads, the data is read into one bufferlist and then
 * scattered across the iovecs; a short read fills only a prefix of the
 * vector.
 *
 * @param fd     file descriptor
 * @param iov    iovec array (caller guarantees iovcnt >= 0)
 * @param iovcnt number of entries in @p iov
 * @param offset absolute file offset
 * @param write  true => pwritev semantics, false => preadv
 * @return bytes transferred, or negative errno
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
    Mutex::Locker lock(client_lock);
    tout(cct) << fd << std::endl;
    tout(cct) << offset << std::endl;

    Fh *fh = get_filehandle(fd);
    if (!fh)
        return -EBADF;
#if defined(__linux__) && defined(O_PATH)
    if (fh->flags & O_PATH)
        return -EBADF;
#endif
    // total transfer size across all iovecs
    loff_t totallen = 0;
    for (unsigned i = 0; i < iovcnt; i++) {
        totallen += iov[i].iov_len;
    }
    if (write) {
        int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
        ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
        return w;
    } else {
        bufferlist bl;
        int r = _read(fh, offset, totallen, &bl);
        ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
        if (r <= 0)
          return r;

        // scatter the r bytes we actually got into the iovecs, in order
        int bufoff = 0;
        for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
               /*
                * This piece of code aims to handle the case that bufferlist does not have enough data 
                * to fill in the iov 
                */
               if (resid < iov[j].iov_len) {
                    bl.copy(bufoff, resid, (char *)iov[j].iov_base);
                    break;
               } else {
                    bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
               }
               resid -= iov[j].iov_len;
               bufoff += iov[j].iov_len;
        }
        return r;  
    }
}
8922
/**
 * Core write path: inline data, buffered (object-cache) writes, and
 * synchronous OSD writes.
 *
 * Exactly one of @p buf or @p iov supplies the data (buf takes priority).
 *
 * @param f      open handle; must have been opened writeable
 * @param offset absolute offset, or negative to write at f->pos
 *               (honoring O_APPEND) and advance it
 * @param size   byte count
 * @param buf    flat source buffer, or NULL
 * @param iov    iovec array used when @p buf is NULL
 * @param iovcnt entries in @p iov
 * @return bytes written (== size) on success, negative errno on error
 *
 * Caller must hold client_lock; it is dropped while blocking on OSD
 * completions.
 */
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  // fail fast rather than queueing writes against a full pool
  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means the inline state is unknown; fetch it first
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  // AUTH_SHARED is needed briefly to inspect in->mode for setuid/setgid
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely((in->mode & S_ISUID) ||
	       (in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the object cache even if we hold the BUFFER cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result would exceed the inline limits (or we lack the cap):
      // push the inline data out to the OSD and do a normal write;
      // completion is awaited at 'done'
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // splice the new bytes into the in-memory inline payload:
      // keep the tail beyond endoff, drop the overwritten middle,
      // zero-fill any gap, then append [offset, endoff)
      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    // decrements unsafe_sync_write and drops the BUFFER cap ref
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    mark_caps_dirty(in, CEPH_CAP_FILE_WR);

    // proactively flush caps when nearing quota or max_size so the MDS
    // can grant more headroom before we actually hit the limit
    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = ceph_clock_now();
  in->change_attr++;
  mark_caps_dirty(in, CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline write-out started above
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means another writer uninlined first; either way the
    // inline copy now lives on the OSD
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9161
9162int Client::_flush(Fh *f)
9163{
9164 Inode *in = f->inode.get();
9165 int err = f->take_async_err();
9166 if (err != 0) {
9167 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9168 << cpp_strerror(err) << dendl;
9169 } else {
9170 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9171 }
9172
9173 return err;
9174}
9175
/**
 * Truncate (or extend) the file at @p relpath to @p length bytes.
 *
 * Implemented as a setattrx with only CEPH_SETATTR_SIZE set; the other
 * stx fields are left uninitialized and ignored because the mask limits
 * which fields setattrx reads.
 */
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9182
9183int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9184{
9185 Mutex::Locker lock(client_lock);
9186 tout(cct) << "ftruncate" << std::endl;
9187 tout(cct) << fd << std::endl;
9188 tout(cct) << length << std::endl;
9189
9190 Fh *f = get_filehandle(fd);
9191 if (!f)
9192 return -EBADF;
9193#if defined(__linux__) && defined(O_PATH)
9194 if (f->flags & O_PATH)
9195 return -EBADF;
9196#endif
9197 struct stat attr;
9198 attr.st_size = length;
9199 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9200}
9201
9202int Client::fsync(int fd, bool syncdataonly)
9203{
9204 Mutex::Locker lock(client_lock);
9205 tout(cct) << "fsync" << std::endl;
9206 tout(cct) << fd << std::endl;
9207 tout(cct) << syncdataonly << std::endl;
9208
9209 Fh *f = get_filehandle(fd);
9210 if (!f)
9211 return -EBADF;
9212#if defined(__linux__) && defined(O_PATH)
9213 if (f->flags & O_PATH)
9214 return -EBADF;
9215#endif
9216 int r = _fsync(f, syncdataonly);
9217 if (r == 0) {
9218 // The IOs in this fsync were okay, but maybe something happened
9219 // in the background that we shoudl be reporting?
9220 r = f->take_async_err();
9221 ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
9222 << ") = 0, async_err = " << r << dendl;
9223 } else {
9224 // Assume that an error we encountered during fsync, even reported
9225 // synchronously, would also have applied the error to the Fh, and we
9226 // should clear it here to avoid returning the same error again on next
9227 // call.
9228 ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
9229 << r << dendl;
9230 f->take_async_err();
9231 }
9232 return r;
9233}
9234
/**
 * Flush an inode's dirty data -- and, unless @p syncdataonly, its dirty
 * caps (metadata) and outstanding unsafe MDS operations -- and wait for
 * everything to reach stable storage.
 *
 * @param in           inode to sync
 * @param syncdataonly true => only file data, not metadata
 * @return 0 on success, or the error reported by the object cacher
 *         writeback
 *
 * Caller must hold client_lock; it is dropped while waiting for the
 * object-cacher flush to complete.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata caps to the MDS; remember the flush tid so we
    // can wait for its commit below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // waiting on the most recent unsafe request implies all earlier ones
    // are safe too (they complete in order)
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9299
9300int Client::_fsync(Fh *f, bool syncdataonly)
9301{
9302 ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9303 return _fsync(f->inode.get(), syncdataonly);
9304}
9305
9306int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9307{
9308 Mutex::Locker lock(client_lock);
9309 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9310 tout(cct) << fd << std::endl;
9311
9312 Fh *f = get_filehandle(fd);
9313 if (!f)
9314 return -EBADF;
9315 int r = _getattr(f->inode, mask, perms);
9316 if (r < 0)
9317 return r;
9318 fill_stat(f->inode, stbuf, NULL);
9319 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9320 return r;
9321}
9322
9323int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9324 unsigned int want, unsigned int flags)
9325{
9326 Mutex::Locker lock(client_lock);
9327 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9328 tout(cct) << fd << std::endl;
9329
9330 Fh *f = get_filehandle(fd);
9331 if (!f)
9332 return -EBADF;
9333
9334 unsigned mask = statx_to_mask(flags, want);
9335
9336 int r = 0;
9337 if (mask && !f->inode->caps_issued_mask(mask)) {
9338 r = _getattr(f->inode, mask, perms);
9339 if (r < 0) {
9340 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9341 return r;
9342 }
9343 }
9344
9345 fill_statx(f->inode, mask, stx);
9346 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9347 return r;
9348}
9349
9350// not written yet, but i want to link!
9351
9352int Client::chdir(const char *relpath, std::string &new_cwd,
9353 const UserPerm& perms)
9354{
9355 Mutex::Locker lock(client_lock);
9356 tout(cct) << "chdir" << std::endl;
9357 tout(cct) << relpath << std::endl;
9358 filepath path(relpath);
9359 InodeRef in;
9360 int r = path_walk(path, &in, perms);
9361 if (r < 0)
9362 return r;
9363 if (cwd != in)
9364 cwd.swap(in);
9365 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9366
9367 getcwd(new_cwd, perms);
9368 return 0;
9369}
9370
/**
 * Build the absolute path of the current working directory by walking
 * dentries from cwd up to the root.
 *
 * If an intermediate inode has no cached parent dentry, a LOOKUPNAME
 * request is sent to the MDS and the walk restarts from cwd.  If cwd or
 * an ancestor has been unlinked, @p dir is left unmodified.
 *
 * @param dir   output: "/"-prefixed path on success
 * @param perms credentials for any MDS lookups
 *
 * Caller must hold client_lock.
 */
void Client::getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // no cached parent link: ask the MDS for this inode's name/parent
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over -- the lookup may have populated several levels
      path = filepath();
      in = cwd.get();
      continue;
    }
    // prepend this component and step up to the parent directory
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9410
/**
 * Fill a statvfs for the mounted filesystem.
 *
 * Normally reports the whole RADOS cluster's capacity; when a byte quota
 * applies at (or above) the mount root and client_quota_df is enabled,
 * the quota is reported as the filesystem size instead.
 *
 * @param path  unused (statistics are global, not per-path)
 * @param stbuf output statvfs
 * @param perms credentials used to locate/refresh the quota root
 * @return 0 on success or the error from the underlying cluster statfs
 *
 * Caller must hold client_lock; it is dropped while waiting for the
 * objecter's get_fs_stats reply.
 */
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  ceph_statfs stats;
  C_SaferCond cond;
  objecter->get_fs_stats(stats, &cond);

  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the overall RADOS cluster's statistics.  Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9500
/**
 * Issue a file-lock operation (fcntl or flock style) to the MDS and
 * mirror the result into the client's local lock state.
 *
 * @param in        inode being locked
 * @param fh        handle the lock is associated with
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_SETFILELOCK or CEPH_MDS_OP_GETFILELOCK
 * @param sleep     nonzero => blocking lock (may be interrupted)
 * @param fl        lock description in/out (GETFILELOCK fills it in)
 * @param owner     lock owner token; top bit is set below to tell the MDS
 *                  that 'owner' alone identifies the holder
 * @param removing  true when tearing down (skip recording on the Fh)
 * @return 0 on success, -EIO for an unknown l_type, or an MDS error
 *         (-EINTR if a blocking lock was interrupted)
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK that actually locks can sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: while blocked, a signal can trigger
    // _interrupt_filelock via the registered callback
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or free) lock the MDS reported back into fl
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the granted lock into the inode-level state (lazily
      // allocating the per-style lock table) ...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ... and, unless we're releasing during teardown, into the
      // handle-level state too (used to drop locks when the Fh closes)
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9611
9612int Client::_interrupt_filelock(MetaRequest *req)
9613{
31f18b77
FG
9614 // Set abort code, but do not kick. The abort code prevents the request
9615 // from being re-sent.
9616 req->abort(-EINTR);
9617 if (req->mds < 0)
9618 return 0; // haven't sent the request
9619
7c673cae
FG
9620 Inode *in = req->inode();
9621
9622 int lock_type;
9623 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
9624 lock_type = CEPH_LOCK_FLOCK_INTR;
9625 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
9626 lock_type = CEPH_LOCK_FCNTL_INTR;
9627 else {
9628 ceph_abort();
9629 return -EINVAL;
9630 }
9631
9632 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
9633 filepath path;
9634 in->make_nosnap_relative_path(path);
9635 intr_req->set_filepath(path);
9636 intr_req->set_inode(in);
9637 intr_req->head.args.filelock_change = req->head.args.filelock_change;
9638 intr_req->head.args.filelock_change.rule = lock_type;
9639 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
9640
9641 UserPerm perms(req->get_uid(), req->get_gid());
9642 return make_request(intr_req, perms, NULL, NULL, -1);
9643}
9644
9645void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9646{
9647 if (!in->fcntl_locks && !in->flock_locks)
9648 return;
9649
9650 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9651 ::encode(nr_fcntl_locks, bl);
9652 if (nr_fcntl_locks) {
9653 ceph_lock_state_t* lock_state = in->fcntl_locks;
9654 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9655 p != lock_state->held_locks.end();
9656 ++p)
9657 ::encode(p->second, bl);
9658 }
9659
9660 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9661 ::encode(nr_flock_locks, bl);
9662 if (nr_flock_locks) {
9663 ceph_lock_state_t* lock_state = in->flock_locks;
9664 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9665 p != lock_state->held_locks.end();
9666 ++p)
9667 ::encode(p->second, bl);
9668 }
9669
9670 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9671 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9672}
9673
9674void Client::_release_filelocks(Fh *fh)
9675{
9676 if (!fh->fcntl_locks && !fh->flock_locks)
9677 return;
9678
9679 Inode *in = fh->inode.get();
9680 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9681
9682 list<pair<int, ceph_filelock> > to_release;
9683
9684 if (fh->fcntl_locks) {
9685 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9686 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9687 p != lock_state->held_locks.end();
9688 ++p)
9689 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9690 delete fh->fcntl_locks;
9691 }
9692 if (fh->flock_locks) {
9693 ceph_lock_state_t* lock_state = fh->flock_locks;
9694 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9695 p != lock_state->held_locks.end();
9696 ++p)
9697 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9698 delete fh->flock_locks;
9699 }
9700
9701 if (to_release.empty())
9702 return;
9703
9704 struct flock fl;
9705 memset(&fl, 0, sizeof(fl));
9706 fl.l_whence = SEEK_SET;
9707 fl.l_type = F_UNLCK;
9708
9709 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9710 p != to_release.end();
9711 ++p) {
9712 fl.l_start = p->second.start;
9713 fl.l_len = p->second.length;
9714 fl.l_pid = p->second.pid;
9715 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9716 p->second.owner, true);
9717 }
9718}
9719
9720void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9721 ceph_lock_state_t *lock_state)
9722{
9723 int lock_cmd;
9724 if (F_RDLCK == fl->l_type)
9725 lock_cmd = CEPH_LOCK_SHARED;
9726 else if (F_WRLCK == fl->l_type)
9727 lock_cmd = CEPH_LOCK_EXCL;
9728 else
9729 lock_cmd = CEPH_LOCK_UNLOCK;;
9730
9731 ceph_filelock filelock;
9732 filelock.start = fl->l_start;
9733 filelock.length = fl->l_len;
9734 filelock.client = 0;
9735 // see comment in _do_filelock()
9736 filelock.owner = owner | (1ULL << 63);
9737 filelock.pid = fl->l_pid;
9738 filelock.type = lock_cmd;
9739
9740 if (filelock.type == CEPH_LOCK_UNLOCK) {
9741 list<ceph_filelock> activated_locks;
9742 lock_state->remove_lock(filelock, activated_locks);
9743 } else {
9744 bool r = lock_state->add_lock(filelock, false, false, NULL);
9745 assert(r);
9746 }
9747}
9748
9749int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9750{
9751 Inode *in = fh->inode.get();
9752 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9753 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9754 return ret;
9755}
9756
9757int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9758{
9759 Inode *in = fh->inode.get();
9760 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9761 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9762 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9763 return ret;
9764}
9765
9766int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9767{
9768 Inode *in = fh->inode.get();
9769 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9770
9771 int sleep = !(cmd & LOCK_NB);
9772 cmd &= ~LOCK_NB;
9773
9774 int type;
9775 switch (cmd) {
9776 case LOCK_SH:
9777 type = F_RDLCK;
9778 break;
9779 case LOCK_EX:
9780 type = F_WRLCK;
9781 break;
9782 case LOCK_UN:
9783 type = F_UNLCK;
9784 break;
9785 default:
9786 return -EINVAL;
9787 }
9788
9789 struct flock fl;
9790 memset(&fl, 0, sizeof(fl));
9791 fl.l_type = type;
9792 fl.l_whence = SEEK_SET;
9793
9794 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
9795 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
9796 return ret;
9797}
9798
9799int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
9800{
9801 /* Since the only thing this does is wrap a call to statfs, and
9802 statfs takes a lock, it doesn't seem we have a need to split it
9803 out. */
9804 return statfs(0, stbuf, perms);
9805}
9806
9807void Client::ll_register_callbacks(struct client_callback_args *args)
9808{
9809 if (!args)
9810 return;
9811 Mutex::Locker l(client_lock);
9812 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
9813 << " invalidate_ino_cb " << args->ino_cb
9814 << " invalidate_dentry_cb " << args->dentry_cb
9815 << " getgroups_cb" << args->getgroups_cb
9816 << " switch_interrupt_cb " << args->switch_intr_cb
9817 << " remount_cb " << args->remount_cb
9818 << dendl;
9819 callback_handle = args->handle;
9820 if (args->ino_cb) {
9821 ino_invalidate_cb = args->ino_cb;
9822 async_ino_invalidator.start();
9823 }
9824 if (args->dentry_cb) {
9825 dentry_invalidate_cb = args->dentry_cb;
9826 async_dentry_invalidator.start();
9827 }
9828 if (args->switch_intr_cb) {
9829 switch_interrupt_cb = args->switch_intr_cb;
9830 interrupt_finisher.start();
9831 }
9832 if (args->remount_cb) {
9833 remount_cb = args->remount_cb;
9834 remount_finisher.start();
9835 }
9836 getgroups_cb = args->getgroups_cb;
9837 umask_cb = args->umask_cb;
9838}
9839
int Client::test_dentry_handling(bool can_invalidate)
{
  // Verify we have some way of keeping the kernel dcache consistent:
  // either a dentry-invalidation callback or a remount callback.
  // Returns 0 on success; a remount failure is propagated only when
  // client_die_on_failed_remount is set.
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    // Caller promised invalidation support, so the callback must be set.
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else if (remount_cb) {
    // Fall back to a full remount to flush the kernel dcache.
    ldout(cct, 1) << "using remount_cb" << dendl;
    int s = remount_cb(callback_handle);
    if (s) {
      lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency"
		 << dendl;
    }
    if (cct->_conf->client_die_on_failed_remount) {
      require_remount = true;
      r = s;
    }
  } else {
    // No invalidation mechanism at all; abort if configured to be strict.
    lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    if (cct->_conf->client_die_on_failed_remount)
      ceph_abort();
  }
  return r;
}
9867
int Client::_sync_fs()
{
  // Flush all dirty data and caps to the cluster and wait for them to be
  // stable.  Caller holds client_lock.
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  // A private mutex/cond pair is used so the object cacher's completion
  // can signal us after we drop client_lock below.
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Drop client_lock while blocking on the data flush so cache
    // callbacks can make progress; reacquire it before returning.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
9902
9903int Client::sync_fs()
9904{
9905 Mutex::Locker l(client_lock);
9906 return _sync_fs();
9907}
9908
9909int64_t Client::drop_caches()
9910{
9911 Mutex::Locker l(client_lock);
9912 return objectcacher->release_all();
9913}
9914
9915
9916int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
9917{
9918 Mutex::Locker l(client_lock);
9919 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
9920 << ", " << offset << ", " << count << ")" << dendl;
9921
9922 Fh *f = get_filehandle(fd);
9923 if (!f)
9924 return -EBADF;
9925
9926 // for now
9927 _fsync(f, true);
9928
9929 return 0;
9930}
9931
9932int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
9933{
9934 Mutex::Locker l(client_lock);
9935 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
9936 << ", " << offset << ", " << count << ")" << dendl;
9937
9938 Fh *f = get_filehandle(fd);
9939 if (!f)
9940 return -EBADF;
9941 Inode *in = f->inode.get();
9942
9943 _fsync(f, true);
9944 if (_release(in))
9945 check_caps(in, 0);
9946 return 0;
9947}
9948
9949
9950// =============================
9951// snaps
9952
9953int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
9954{
9955 Mutex::Locker l(client_lock);
9956 filepath path(relpath);
9957 InodeRef in;
9958 int r = path_walk(path, &in, perm);
9959 if (r < 0)
9960 return r;
9961 if (cct->_conf->client_permissions) {
9962 r = may_create(in.get(), perm);
9963 if (r < 0)
9964 return r;
9965 }
9966 Inode *snapdir = open_snapdir(in.get());
9967 return _mkdir(snapdir, name, 0, perm);
9968}
9969int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
9970{
9971 Mutex::Locker l(client_lock);
9972 filepath path(relpath);
9973 InodeRef in;
9974 int r = path_walk(path, &in, perms);
9975 if (r < 0)
9976 return r;
9977 if (cct->_conf->client_permissions) {
9978 r = may_delete(in.get(), NULL, perms);
9979 if (r < 0)
9980 return r;
9981 }
9982 Inode *snapdir = open_snapdir(in.get());
9983 return _rmdir(snapdir, name, perms);
9984}
9985
9986// =============================
9987// expose caps
9988
9989int Client::get_caps_issued(int fd) {
9990
9991 Mutex::Locker lock(client_lock);
9992
9993 Fh *f = get_filehandle(fd);
9994 if (!f)
9995 return -EBADF;
9996
9997 return f->inode->caps_issued();
9998}
9999
10000int Client::get_caps_issued(const char *path, const UserPerm& perms)
10001{
10002 Mutex::Locker lock(client_lock);
10003 filepath p(path);
10004 InodeRef in;
10005 int r = path_walk(p, &in, perms, true);
10006 if (r < 0)
10007 return r;
10008 return in->caps_issued();
10009}
10010
10011// =========================================
10012// low level
10013
10014Inode *Client::open_snapdir(Inode *diri)
10015{
10016 Inode *in;
10017 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10018 if (!inode_map.count(vino)) {
10019 in = new Inode(this, vino, &diri->layout);
10020
10021 in->ino = diri->ino;
10022 in->snapid = CEPH_SNAPDIR;
10023 in->mode = diri->mode;
10024 in->uid = diri->uid;
10025 in->gid = diri->gid;
10026 in->mtime = diri->mtime;
10027 in->ctime = diri->ctime;
10028 in->btime = diri->btime;
10029 in->size = diri->size;
10030 in->change_attr = diri->change_attr;
10031
10032 in->dirfragtree.clear();
10033 in->snapdir_parent = diri;
10034 diri->flags |= I_SNAPDIR_OPEN;
10035 inode_map[vino] = in;
10036 if (use_faked_inos())
10037 _assign_faked_ino(in);
10038 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10039 } else {
10040 in = inode_map[vino];
10041 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10042 }
10043 return in;
10044}
10045
10046int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10047 Inode **out, const UserPerm& perms)
10048{
10049 Mutex::Locker lock(client_lock);
31f18b77
FG
10050 vinodeno_t vparent = _get_vino(parent);
10051 ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
7c673cae
FG
10052 tout(cct) << "ll_lookup" << std::endl;
10053 tout(cct) << name << std::endl;
10054
10055 int r = 0;
10056 if (!cct->_conf->fuse_default_permissions) {
10057 r = may_lookup(parent, perms);
10058 if (r < 0)
10059 return r;
10060 }
10061
10062 string dname(name);
10063 InodeRef in;
10064
10065 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10066 if (r < 0) {
10067 attr->st_ino = 0;
10068 goto out;
10069 }
10070
10071 assert(in);
10072 fill_stat(in, attr);
10073 _ll_get(in.get());
10074
10075 out:
31f18b77 10076 ldout(cct, 3) << "ll_lookup " << vparent << " " << name
7c673cae
FG
10077 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10078 tout(cct) << attr->st_ino << std::endl;
10079 *out = in.get();
10080 return r;
10081}
10082
10083int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10084 struct ceph_statx *stx, unsigned want, unsigned flags,
10085 const UserPerm& perms)
10086{
10087 Mutex::Locker lock(client_lock);
31f18b77
FG
10088 vinodeno_t vparent = _get_vino(parent);
10089 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
7c673cae
FG
10090 tout(cct) << "ll_lookupx" << std::endl;
10091 tout(cct) << name << std::endl;
10092
10093 int r = 0;
10094 if (!cct->_conf->fuse_default_permissions) {
10095 r = may_lookup(parent, perms);
10096 if (r < 0)
10097 return r;
10098 }
10099
10100 string dname(name);
10101 InodeRef in;
10102
10103 unsigned mask = statx_to_mask(flags, want);
10104 r = _lookup(parent, dname, mask, &in, perms);
10105 if (r < 0) {
10106 stx->stx_ino = 0;
10107 stx->stx_mask = 0;
10108 } else {
10109 assert(in);
10110 fill_statx(in, mask, stx);
10111 _ll_get(in.get());
10112 }
10113
31f18b77 10114 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
7c673cae
FG
10115 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10116 tout(cct) << stx->stx_ino << std::endl;
10117 *out = in.get();
10118 return r;
10119}
10120
10121int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10122 unsigned int want, unsigned int flags, const UserPerm& perms)
10123{
10124 Mutex::Locker lock(client_lock);
10125 filepath fp(name, 0);
10126 InodeRef in;
10127 int rc;
10128 unsigned mask = statx_to_mask(flags, want);
10129
10130 ldout(cct, 3) << "ll_walk" << name << dendl;
10131 tout(cct) << "ll_walk" << std::endl;
10132 tout(cct) << name << std::endl;
10133
10134 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10135 if (rc < 0) {
10136 /* zero out mask, just in case... */
10137 stx->stx_mask = 0;
10138 stx->stx_ino = 0;
10139 *out = NULL;
10140 return rc;
10141 } else {
10142 assert(in);
10143 fill_statx(in, mask, stx);
10144 _ll_get(in.get());
10145 *out = in.get();
10146 return 0;
10147 }
10148}
10149
void Client::_ll_get(Inode *in)
{
  // Take a low-level (FUSE) reference on the inode.  The first ll ref
  // also takes a regular inode ref and, for directories, pins the single
  // parent dentry so the path stays resolvable.
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10162
int Client::_ll_put(Inode *in, int num)
{
  // Drop 'num' low-level (FUSE) references.  When the ll refcount hits
  // zero, undo what _ll_get() did on the first ref: unpin the dentry
  // (directories only) and drop the inode reference.
  // Returns the remaining ll refcount (0 when fully released).
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10178
void Client::_ll_drop_pins()
{
  // Release every outstanding ll reference on every cached inode.
  // The 'next' iterator is saved before calling _ll_put() — presumably
  // because dropping the last ref can remove the inode from inode_map,
  // which would invalidate the current iterator (NOTE: put_inode's
  // behavior is not visible here; confirm before refactoring).
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref)
      _ll_put(in, in->ll_ref);
  }
}
10193
bool Client::ll_forget(Inode *in, int count)
{
  // Drop 'count' ll references (FUSE "forget").  Returns true when the
  // inode's ll refcount reached zero (or the root inode, which is never
  // released).
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // Kernel asked us to forget more refs than we hold; warn and clamp
    // to the refs we actually have.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10219
10220bool Client::ll_put(Inode *in)
10221{
10222 /* ll_forget already takes the lock */
10223 return ll_forget(in, 1);
10224}
10225
10226snapid_t Client::ll_get_snapid(Inode *in)
10227{
10228 Mutex::Locker lock(client_lock);
10229 return in->snapid;
10230}
10231
10232Inode *Client::ll_get_inode(ino_t ino)
10233{
10234 Mutex::Locker lock(client_lock);
10235 vinodeno_t vino = _map_faked_ino(ino);
10236 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10237 if (p == inode_map.end())
10238 return NULL;
10239 Inode *in = p->second;
10240 _ll_get(in);
10241 return in;
10242}
10243
10244Inode *Client::ll_get_inode(vinodeno_t vino)
10245{
10246 Mutex::Locker lock(client_lock);
10247 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10248 if (p == inode_map.end())
10249 return NULL;
10250 Inode *in = p->second;
10251 _ll_get(in);
10252 return in;
10253}
10254
10255int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10256{
10257 vinodeno_t vino = _get_vino(in);
10258
10259 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10260 tout(cct) << "ll_getattr" << std::endl;
10261 tout(cct) << vino.ino.val << std::endl;
10262
10263 if (vino.snapid < CEPH_NOSNAP)
10264 return 0;
10265 else
10266 return _getattr(in, caps, perms);
10267}
10268
10269int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10270{
10271 Mutex::Locker lock(client_lock);
10272
10273 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10274
10275 if (res == 0)
10276 fill_stat(in, attr);
10277 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10278 return res;
10279}
10280
10281int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10282 unsigned int flags, const UserPerm& perms)
10283{
10284 Mutex::Locker lock(client_lock);
10285
10286 int res = 0;
10287 unsigned mask = statx_to_mask(flags, want);
10288
10289 if (mask && !in->caps_issued_mask(mask))
10290 res = _ll_getattr(in, mask, perms);
10291
10292 if (res == 0)
10293 fill_statx(in, mask, stx);
10294 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10295 return res;
10296}
10297
10298int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10299 const UserPerm& perms, InodeRef *inp)
10300{
10301 vinodeno_t vino = _get_vino(in);
10302
10303 ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
10304 << dendl;
10305 tout(cct) << "ll_setattrx" << std::endl;
10306 tout(cct) << vino.ino.val << std::endl;
10307 tout(cct) << stx->stx_mode << std::endl;
10308 tout(cct) << stx->stx_uid << std::endl;
10309 tout(cct) << stx->stx_gid << std::endl;
10310 tout(cct) << stx->stx_size << std::endl;
10311 tout(cct) << stx->stx_mtime << std::endl;
10312 tout(cct) << stx->stx_atime << std::endl;
10313 tout(cct) << stx->stx_btime << std::endl;
10314 tout(cct) << mask << std::endl;
10315
10316 if (!cct->_conf->fuse_default_permissions) {
10317 int res = may_setattr(in, stx, mask, perms);
10318 if (res < 0)
10319 return res;
10320 }
10321
10322 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
10323
10324 return __setattrx(in, stx, mask, perms, inp);
10325}
10326
10327int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10328 const UserPerm& perms)
10329{
10330 Mutex::Locker lock(client_lock);
10331 InodeRef target(in);
10332 int res = _ll_setattrx(in, stx, mask, perms, &target);
10333 if (res == 0) {
10334 assert(in == target.get());
10335 fill_statx(in, in->caps_issued(), stx);
10336 }
10337
10338 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10339 return res;
10340}
10341
10342int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10343 const UserPerm& perms)
10344{
10345 struct ceph_statx stx;
10346 stat_to_statx(attr, &stx);
10347
10348 Mutex::Locker lock(client_lock);
10349 InodeRef target(in);
10350 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10351 if (res == 0) {
10352 assert(in == target.get());
10353 fill_stat(in, attr);
10354 }
10355
10356 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10357 return res;
10358}
10359
10360
10361// ----------
10362// xattrs
10363
10364int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10365 const UserPerm& perms)
10366{
10367 Mutex::Locker lock(client_lock);
10368 InodeRef in;
10369 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10370 if (r < 0)
10371 return r;
10372 return _getxattr(in, name, value, size, perms);
10373}
10374
10375int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10376 const UserPerm& perms)
10377{
10378 Mutex::Locker lock(client_lock);
10379 InodeRef in;
10380 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10381 if (r < 0)
10382 return r;
10383 return _getxattr(in, name, value, size, perms);
10384}
10385
10386int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10387 const UserPerm& perms)
10388{
10389 Mutex::Locker lock(client_lock);
10390 Fh *f = get_filehandle(fd);
10391 if (!f)
10392 return -EBADF;
10393 return _getxattr(f->inode, name, value, size, perms);
10394}
10395
10396int Client::listxattr(const char *path, char *list, size_t size,
10397 const UserPerm& perms)
10398{
10399 Mutex::Locker lock(client_lock);
10400 InodeRef in;
10401 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10402 if (r < 0)
10403 return r;
10404 return Client::_listxattr(in.get(), list, size, perms);
10405}
10406
10407int Client::llistxattr(const char *path, char *list, size_t size,
10408 const UserPerm& perms)
10409{
10410 Mutex::Locker lock(client_lock);
10411 InodeRef in;
10412 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10413 if (r < 0)
10414 return r;
10415 return Client::_listxattr(in.get(), list, size, perms);
10416}
10417
10418int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10419{
10420 Mutex::Locker lock(client_lock);
10421 Fh *f = get_filehandle(fd);
10422 if (!f)
10423 return -EBADF;
10424 return Client::_listxattr(f->inode.get(), list, size, perms);
10425}
10426
10427int Client::removexattr(const char *path, const char *name,
10428 const UserPerm& perms)
10429{
10430 Mutex::Locker lock(client_lock);
10431 InodeRef in;
10432 int r = Client::path_walk(path, &in, perms, true);
10433 if (r < 0)
10434 return r;
10435 return _removexattr(in, name, perms);
10436}
10437
10438int Client::lremovexattr(const char *path, const char *name,
10439 const UserPerm& perms)
10440{
10441 Mutex::Locker lock(client_lock);
10442 InodeRef in;
10443 int r = Client::path_walk(path, &in, perms, false);
10444 if (r < 0)
10445 return r;
10446 return _removexattr(in, name, perms);
10447}
10448
10449int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10450{
10451 Mutex::Locker lock(client_lock);
10452 Fh *f = get_filehandle(fd);
10453 if (!f)
10454 return -EBADF;
10455 return _removexattr(f->inode, name, perms);
10456}
10457
10458int Client::setxattr(const char *path, const char *name, const void *value,
10459 size_t size, int flags, const UserPerm& perms)
10460{
10461 _setxattr_maybe_wait_for_osdmap(name, value, size);
10462
10463 Mutex::Locker lock(client_lock);
10464 InodeRef in;
10465 int r = Client::path_walk(path, &in, perms, true);
10466 if (r < 0)
10467 return r;
10468 return _setxattr(in, name, value, size, flags, perms);
10469}
10470
10471int Client::lsetxattr(const char *path, const char *name, const void *value,
10472 size_t size, int flags, const UserPerm& perms)
10473{
10474 _setxattr_maybe_wait_for_osdmap(name, value, size);
10475
10476 Mutex::Locker lock(client_lock);
10477 InodeRef in;
10478 int r = Client::path_walk(path, &in, perms, false);
10479 if (r < 0)
10480 return r;
10481 return _setxattr(in, name, value, size, flags, perms);
10482}
10483
10484int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10485 int flags, const UserPerm& perms)
10486{
10487 _setxattr_maybe_wait_for_osdmap(name, value, size);
10488
10489 Mutex::Locker lock(client_lock);
10490 Fh *f = get_filehandle(fd);
10491 if (!f)
10492 return -EBADF;
10493 return _setxattr(f->inode, name, value, size, flags, perms);
10494}
10495
10496int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10497 const UserPerm& perms)
10498{
10499 int r;
10500
10501 const VXattr *vxattr = _match_vxattr(in, name);
10502 if (vxattr) {
10503 r = -ENODATA;
10504
10505 // Do a force getattr to get the latest quota before returning
10506 // a value to userspace.
10507 r = _getattr(in, 0, perms, true);
10508 if (r != 0) {
10509 // Error from getattr!
10510 return r;
10511 }
10512
10513 // call pointer-to-member function
10514 char buf[256];
10515 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10516 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10517 } else {
10518 r = -ENODATA;
10519 }
10520
10521 if (size != 0) {
10522 if (r > (int)size) {
10523 r = -ERANGE;
10524 } else if (r > 0) {
10525 memcpy(value, buf, r);
10526 }
10527 }
10528 goto out;
10529 }
10530
10531 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10532 r = -EOPNOTSUPP;
10533 goto out;
10534 }
10535
10536 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10537 if (r == 0) {
10538 string n(name);
10539 r = -ENODATA;
10540 if (in->xattrs.count(n)) {
10541 r = in->xattrs[n].length();
10542 if (r > 0 && size != 0) {
10543 if (size >= (unsigned)r)
10544 memcpy(value, in->xattrs[n].c_str(), r);
10545 else
10546 r = -ERANGE;
10547 }
10548 }
10549 }
10550 out:
10551 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10552 return r;
10553}
10554
10555int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10556 const UserPerm& perms)
10557{
10558 if (cct->_conf->client_permissions) {
10559 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10560 if (r < 0)
10561 return r;
10562 }
10563 return _getxattr(in.get(), name, value, size, perms);
10564}
10565
10566int Client::ll_getxattr(Inode *in, const char *name, void *value,
10567 size_t size, const UserPerm& perms)
10568{
10569 Mutex::Locker lock(client_lock);
10570
10571 vinodeno_t vino = _get_vino(in);
10572
10573 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10574 tout(cct) << "ll_getxattr" << std::endl;
10575 tout(cct) << vino.ino.val << std::endl;
10576 tout(cct) << name << std::endl;
10577
10578 if (!cct->_conf->fuse_default_permissions) {
10579 int r = xattr_permission(in, name, MAY_READ, perms);
10580 if (r < 0)
10581 return r;
10582 }
10583
10584 return _getxattr(in, name, value, size, perms);
10585}
10586
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // List xattr names as NUL-terminated strings packed into 'name'.
  // First pass computes the required size (returned in r); the buffer is
  // only filled when the caller supplied a non-zero 'size'.
  // NOTE(review): the size accounting uses _vxattrs_name_size() while
  // the fill loop skips hidden/non-existing vxattrs — presumably those
  // helpers agree; confirm before changing either side.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // Real xattrs: name plus trailing NUL each.
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // Virtual xattrs applicable to this inode.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	// Caller's buffer is too small for the full list.
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10631
10632int Client::ll_listxattr(Inode *in, char *names, size_t size,
10633 const UserPerm& perms)
10634{
10635 Mutex::Locker lock(client_lock);
10636
10637 vinodeno_t vino = _get_vino(in);
10638
10639 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10640 tout(cct) << "ll_listxattr" << std::endl;
10641 tout(cct) << vino.ino.val << std::endl;
10642 tout(cct) << size << std::endl;
10643
10644 return _listxattr(in, names, size, perms);
10645}
10646
10647int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10648 size_t size, int flags, const UserPerm& perms)
10649{
10650
10651 int xattr_flags = 0;
10652 if (!value)
10653 xattr_flags |= CEPH_XATTR_REMOVE;
10654 if (flags & XATTR_CREATE)
10655 xattr_flags |= CEPH_XATTR_CREATE;
10656 if (flags & XATTR_REPLACE)
10657 xattr_flags |= CEPH_XATTR_REPLACE;
10658
10659 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
10660 filepath path;
10661 in->make_nosnap_relative_path(path);
10662 req->set_filepath(path);
10663 req->set_string2(name);
10664 req->set_inode(in);
10665 req->head.args.setxattr.flags = xattr_flags;
10666
10667 bufferlist bl;
10668 bl.append((const char*)value, size);
10669 req->set_data(bl);
10670
10671 int res = make_request(req, perms);
10672
10673 trim_cache();
10674 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
10675 res << dendl;
10676 return res;
10677}
10678
// Validate and dispatch a setxattr on an inode.
//
// Rejects snapshots (read-only), restricts names to the namespaces the
// kernel client also accepts, special-cases POSIX ACL xattrs (which may be
// translated into a plain chmod), and refuses writes to read-only virtual
// xattrs. On success delegates the actual MDS request to _do_setxattr().
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are immutable.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only meaningful here when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Allow only the namespaces the kernel client supports.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the ACL is equivalent to a plain mode, drop the xattr
	// (value=NULL => CEPH_XATTR_REMOVE downstream) and apply the mode.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE(review): stx is only partially initialized here; this
	  // relies on _do_setattr honoring only the CEPH_SETATTR_MODE bit
	  // of the mask — confirm if _do_setattr ever reads other fields.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	// An empty/no-op ACL is stored as a removal.
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      // Other "system.*" names are unsupported.
      return -EOPNOTSUPP;
    }
  } else {
    // Read-only virtual xattrs (e.g. ceph.dir.entries) cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
10739
10740int Client::_setxattr(InodeRef &in, const char *name, const void *value,
10741 size_t size, int flags, const UserPerm& perms)
10742{
10743 if (cct->_conf->client_permissions) {
10744 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
10745 if (r < 0)
10746 return r;
10747 }
10748 return _setxattr(in.get(), name, value, size, flags, perms);
10749}
10750
10751int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
10752{
10753 string tmp;
10754 if (name == "layout") {
10755 string::iterator begin = value.begin();
10756 string::iterator end = value.end();
10757 keys_and_values<string::iterator> p; // create instance of parser
10758 std::map<string, string> m; // map to receive results
10759 if (!qi::parse(begin, end, p, m)) { // returns true if successful
10760 return -EINVAL;
10761 }
10762 if (begin != end)
10763 return -EINVAL;
10764 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
10765 if (q->first == "pool") {
10766 tmp = q->second;
10767 break;
10768 }
10769 }
10770 } else if (name == "layout.pool") {
10771 tmp = value;
10772 }
10773
10774 if (tmp.length()) {
10775 int64_t pool;
10776 try {
10777 pool = boost::lexical_cast<unsigned>(tmp);
10778 if (!osdmap->have_pg_pool(pool))
10779 return -ENOENT;
10780 } catch (boost::bad_lexical_cast const&) {
10781 pool = osdmap->lookup_pg_pool_name(tmp);
10782 if (pool < 0) {
10783 return -ENOENT;
10784 }
10785 }
10786 }
10787
10788 return 0;
10789}
10790
10791void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
10792{
10793 // For setting pool of layout, MetaRequest need osdmap epoch.
10794 // There is a race which create a new data pool but client and mds both don't have.
10795 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
10796 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
10797 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
10798 string rest(strstr(name, "layout"));
10799 string v((const char*)value, size);
10800 int r = objecter->with_osdmap([&](const OSDMap& o) {
10801 return _setxattr_check_data_pool(rest, v, &o);
10802 });
10803
10804 if (r == -ENOENT) {
10805 C_SaferCond ctx;
10806 objecter->wait_for_latest_osdmap(&ctx);
10807 ctx.wait();
10808 }
10809 }
10810}
10811
10812int Client::ll_setxattr(Inode *in, const char *name, const void *value,
10813 size_t size, int flags, const UserPerm& perms)
10814{
10815 _setxattr_maybe_wait_for_osdmap(name, value, size);
10816
10817 Mutex::Locker lock(client_lock);
10818
10819 vinodeno_t vino = _get_vino(in);
10820
10821 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
10822 tout(cct) << "ll_setxattr" << std::endl;
10823 tout(cct) << vino.ino.val << std::endl;
10824 tout(cct) << name << std::endl;
10825
10826 if (!cct->_conf->fuse_default_permissions) {
10827 int r = xattr_permission(in, name, MAY_WRITE, perms);
10828 if (r < 0)
10829 return r;
10830 }
10831 return _setxattr(in, name, value, size, flags, perms);
10832}
10833
10834int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
10835{
10836 if (in->snapid != CEPH_NOSNAP) {
10837 return -EROFS;
10838 }
10839
10840 // same xattrs supported by kernel client
10841 if (strncmp(name, "user.", 5) &&
10842 strncmp(name, "system.", 7) &&
10843 strncmp(name, "security.", 9) &&
10844 strncmp(name, "trusted.", 8) &&
10845 strncmp(name, "ceph.", 5))
10846 return -EOPNOTSUPP;
10847
10848 const VXattr *vxattr = _match_vxattr(in, name);
10849 if (vxattr && vxattr->readonly)
10850 return -EOPNOTSUPP;
10851
10852 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
10853 filepath path;
10854 in->make_nosnap_relative_path(path);
10855 req->set_filepath(path);
10856 req->set_filepath2(name);
10857 req->set_inode(in);
10858
10859 int res = make_request(req, perms);
10860
10861 trim_cache();
10862 ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
10863 return res;
10864}
10865
10866int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
10867{
10868 if (cct->_conf->client_permissions) {
10869 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
10870 if (r < 0)
10871 return r;
10872 }
10873 return _removexattr(in.get(), name, perms);
10874}
10875
10876int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
10877{
10878 Mutex::Locker lock(client_lock);
10879
10880 vinodeno_t vino = _get_vino(in);
10881
10882 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
10883 tout(cct) << "ll_removexattr" << std::endl;
10884 tout(cct) << vino.ino.val << std::endl;
10885 tout(cct) << name << std::endl;
10886
10887 if (!cct->_conf->fuse_default_permissions) {
10888 int r = xattr_permission(in, name, MAY_WRITE, perms);
10889 if (r < 0)
10890 return r;
10891 }
10892
10893 return _removexattr(in, name, perms);
10894}
10895
10896bool Client::_vxattrcb_quota_exists(Inode *in)
10897{
10898 return in->quota.is_enable();
10899}
10900size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
10901{
10902 return snprintf(val, size,
10903 "max_bytes=%lld max_files=%lld",
10904 (long long int)in->quota.max_bytes,
10905 (long long int)in->quota.max_files);
10906}
10907size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
10908{
10909 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
10910}
10911size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
10912{
10913 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
10914}
10915
10916bool Client::_vxattrcb_layout_exists(Inode *in)
10917{
10918 return in->layout != file_layout_t();
10919}
// Render the full layout as "stripe_unit=.. stripe_count=.. object_size=..
// pool=<name-or-id>[ pool_namespace=..]". Returns the snprintf-style length.
//
// NOTE(review): the chained snprintf calls assume the buffer is large
// enough — if an intermediate r ever reached size, (size - r) would wrap as
// size_t and val + r would point past the buffer; confirm all callers pass
// a sufficiently large buffer. Also note "%lld" is paired with unsigned
// long long arguments here.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  // Prefer the pool's name; fall back to its numeric id when the pool is
  // unknown to our current osdmap.
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Namespace is appended only when non-empty.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
10940size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
10941{
10942 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
10943}
10944size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
10945{
10946 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
10947}
10948size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
10949{
10950 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
10951}
10952size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
10953{
10954 size_t r;
10955 objecter->with_osdmap([&](const OSDMap& o) {
10956 if (o.have_pg_pool(in->layout.pool_id))
10957 r = snprintf(val, size, "%s", o.get_pool_name(
10958 in->layout.pool_id).c_str());
10959 else
10960 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
10961 });
10962 return r;
10963}
10964size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
10965{
10966 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
10967}
10968size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
10969{
10970 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
10971}
10972size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
10973{
10974 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
10975}
10976size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
10977{
10978 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
10979}
10980size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
10981{
10982 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
10983}
10984size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
10985{
10986 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
10987}
10988size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
10989{
10990 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
10991}
10992size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
10993{
10994 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
10995}
10996size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
10997{
10998 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
10999 (long)in->rstat.rctime.nsec());
11000}
11001
// Build fully-qualified vxattr names, e.g. CEPH_XATTR_NAME(dir, entries)
// expands to "ceph.dir.entries".
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
// Two-level variant: CEPH_XATTR_NAME2(dir, layout, pool) -> "ceph.dir.layout.pool".
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// VXattr table entry for a read-only, always-visible virtual xattr backed
// by the _vxattrcb_<type>_<name> getter.
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
}
// VXattr table entry for a writable, hidden layout sub-field; it "exists"
// (is reported) only when the inode's layout differs from the default.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
}
// VXattr table entry for a writable, hidden quota sub-field; it "exists"
// only while a quota is configured on the inode.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
}
11029
// Virtual xattrs exposed on directories. Hidden entries are gettable but
// omitted from listxattr; the table is terminated by an empty-name entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  // Composite layout string; writable, hidden, shown only when non-default.
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // Read-only recursive statistics maintained by the MDS.
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  // Composite quota string; shown only while a quota is set.
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
11062
// Virtual xattrs exposed on regular files: just the layout and its
// sub-fields. Terminated by an empty-name entry.
const Client::VXattr Client::_file_vxattrs[] = {
  // Composite layout string; writable, hidden, shown only when non-default.
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
11078
11079const Client::VXattr *Client::_get_vxattrs(Inode *in)
11080{
11081 if (in->is_dir())
11082 return _dir_vxattrs;
11083 else if (in->is_file())
11084 return _file_vxattrs;
11085 return NULL;
11086}
11087
11088const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11089{
11090 if (strncmp(name, "ceph.", 5) == 0) {
11091 const VXattr *vxattr = _get_vxattrs(in);
11092 if (vxattr) {
11093 while (!vxattr->name.empty()) {
11094 if (vxattr->name == name)
11095 return vxattr;
11096 vxattr++;
11097 }
11098 }
11099 }
11100 return NULL;
11101}
11102
11103size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11104{
11105 size_t len = 0;
11106 while (!vxattr->name.empty()) {
11107 if (!vxattr->hidden)
11108 len += vxattr->name.length() + 1;
11109 vxattr++;
11110 }
11111 return len;
11112}
11113
11114int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11115{
11116 Mutex::Locker lock(client_lock);
11117
11118 vinodeno_t vino = _get_vino(in);
11119
11120 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11121 tout(cct) << "ll_readlink" << std::endl;
11122 tout(cct) << vino.ino.val << std::endl;
11123
11124 set<Dentry*>::iterator dn = in->dn_set.begin();
11125 while (dn != in->dn_set.end()) {
11126 touch_dn(*dn);
11127 ++dn;
11128 }
11129
11130 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11131 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11132 return r;
11133}
11134
// Create a special file (device node, fifo, socket, ...) named `name` in
// `dir` via an MDS MKNOD request. On success *inp refers to the new inode.
// Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS on snapshots,
// -EDQUOT when the file-count quota is exceeded, or the MDS result).
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Drop shared (but keep exclusive) dentry caps so the MDS can update the
  // directory entry.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11188
// Low-level mknod: optional permission check, create via _mknod(), then
// fill *attr and hand the caller a reference on the new inode via *out.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Take an ll reference the caller must later drop (ll_forget/ll_put).
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr is untouched, so attr->st_ino below may
  // be uninitialized caller memory (compare ll_mkdirx, which zeroes stx) —
  // confirm whether callers always pre-zero *attr.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // On failure `in` is empty, so *out is set to NULL.
  *out = in.get();
  return r;
}
11222
// statx flavor of ll_mknod: same flow, but fills a ceph_statx selected by
// the want/flags mask instead of a struct stat.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    // Take an ll reference the caller must later drop.
    _ll_get(in.get());
  }
  // NOTE(review): on failure stx is untouched here (ll_mkdirx zeroes
  // stx_ino/stx_mask in the same situation) — confirm callers pre-zero it.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	  << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11258
// Create (and optionally open) a regular file `name` in `dir`.
//
// stripe_unit/stripe_count/object_size/data_pool override the default file
// layout when non-zero/non-empty. If fhp is non-NULL the new file is also
// opened and *fhp receives the handle; *created (if non-NULL) reports
// whether the MDS actually created the file (vs. it already existing).
// Returns 0 or a negative errno.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the requested data pool name to an id up front.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    // The wire format only carries a 32-bit pool id.
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs from the parent; may also adjust `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11351
11352
// Create directory `name` in `dir`. If `dir` is the special snapdir this
// becomes a MKSNAP (snapshot creation) instead of a MKDIR. On success *inp
// refers to the new inode. Returns 0 or a negative errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable targets are live directories and the snapdir (for mksnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Inherit default ACLs from the parent; may also adjust `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11408
// Low-level mkdir: optional permission check, create via _mkdir(), then
// fill *attr and hand the caller a reference on the new inode via *out.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Take an ll reference the caller must later drop.
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr is untouched, so attr->st_ino below may
  // be uninitialized caller memory (ll_mkdirx zeroes stx in this case) —
  // confirm callers pre-zero *attr.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // On failure `in` is empty, so *out is set to NULL.
  *out = in.get();
  return r;
}
11440
11441int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11442 struct ceph_statx *stx, unsigned want, unsigned flags,
11443 const UserPerm& perms)
11444{
11445 Mutex::Locker lock(client_lock);
11446
11447 vinodeno_t vparent = _get_vino(parent);
11448
11449 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11450 tout(cct) << "ll_mkdirx" << std::endl;
11451 tout(cct) << vparent.ino.val << std::endl;
11452 tout(cct) << name << std::endl;
11453 tout(cct) << mode << std::endl;
11454
11455 if (!cct->_conf->fuse_default_permissions) {
11456 int r = may_create(parent, perms);
11457 if (r < 0)
11458 return r;
11459 }
11460
11461 InodeRef in;
11462 int r = _mkdir(parent, name, mode, perms, &in);
11463 if (r == 0) {
11464 fill_statx(in, statx_to_mask(flags, want), stx);
11465 _ll_get(in.get());
11466 } else {
11467 stx->stx_ino = 0;
11468 stx->stx_mask = 0;
11469 }
11470 tout(cct) << stx->stx_ino << std::endl;
11471 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11472 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11473 *out = in.get();
11474 return r;
11475}
11476
// Create a symlink `name` -> `target` in `dir` via an MDS SYMLINK request.
// On success *inp refers to the new inode. Returns 0 or a negative errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // The link target travels in the request's second string slot.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11522
// Low-level symlink: optional permission check, create via _symlink(),
// then fill *attr and hand the caller a reference via *out.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Take an ll reference the caller must later drop.
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr is untouched, so attr->st_ino below may
  // be uninitialized caller memory — confirm callers pre-zero *attr.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // On failure `in` is empty, so *out is set to NULL.
  *out = in.get();
  return r;
}
11555
11556int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11557 Inode **out, struct ceph_statx *stx, unsigned want,
11558 unsigned flags, const UserPerm& perms)
11559{
11560 Mutex::Locker lock(client_lock);
11561
11562 vinodeno_t vparent = _get_vino(parent);
11563
11564 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11565 << dendl;
11566 tout(cct) << "ll_symlinkx" << std::endl;
11567 tout(cct) << vparent.ino.val << std::endl;
11568 tout(cct) << name << std::endl;
11569 tout(cct) << value << std::endl;
11570
11571 if (!cct->_conf->fuse_default_permissions) {
11572 int r = may_create(parent, perms);
11573 if (r < 0)
11574 return r;
11575 }
11576
11577 InodeRef in;
11578 int r = _symlink(parent, name, value, perms, &in);
11579 if (r == 0) {
11580 fill_statx(in, statx_to_mask(flags, want), stx);
11581 _ll_get(in.get());
11582 }
11583 tout(cct) << stx->stx_ino << std::endl;
11584 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11585 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11586 *out = in.get();
11587 return r;
11588}
11589
// Remove the non-directory entry `name` from `dir` via an MDS UNLINK
// request. Returns 0 or a negative errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the victim inode so the MDS request can reference it and we
  // can drop its link caps.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;
  req->set_other_inode(otherin.get());
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11635
11636int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
11637{
11638 Mutex::Locker lock(client_lock);
11639
11640 vinodeno_t vino = _get_vino(in);
11641
11642 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
11643 tout(cct) << "ll_unlink" << std::endl;
11644 tout(cct) << vino.ino.val << std::endl;
11645 tout(cct) << name << std::endl;
11646
11647 if (!cct->_conf->fuse_default_permissions) {
11648 int r = may_delete(in, name, perm);
11649 if (r < 0)
11650 return r;
11651 }
11652 return _unlink(in, name, perm);
11653}
11654
// Remove the directory entry `name` from `dir`. When `dir` is the special
// snapdir this becomes an RMSNAP (snapshot removal) instead of an RMDIR.
// Returns 0 or a negative errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Writable targets are live directories and the snapdir (for rmsnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  // Resolve the victim inode for the request.
  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (req->get_op() == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_dentry(de);
    req->set_other_inode(in.get());
  } else {
    // rmsnap: drop the cached dentry up front instead of attaching it.
    unlink(de, true, true);
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // Error before the request was sent: release our ref so it is freed.
  put_request(req);
  return res;
}
11702
11703int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
11704{
11705 Mutex::Locker lock(client_lock);
11706
11707 vinodeno_t vino = _get_vino(in);
11708
11709 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
11710 tout(cct) << "ll_rmdir" << std::endl;
11711 tout(cct) << vino.ino.val << std::endl;
11712 tout(cct) << name << std::endl;
11713
11714 if (!cct->_conf->fuse_default_permissions) {
11715 int r = may_delete(in, name, perms);
11716 if (r < 0)
11717 return r;
11718 }
11719
11720 return _rmdir(in, name, perms);
11721}
11722
/**
 * Rename @fromname in @fromdir to @toname in @todir, or rename a snapshot
 * when both names live in the same snapdir.
 *
 * Returns 0 or a negative errno: -EXDEV when the rename would cross a
 * snapshot boundary or a quota root, -EROFS for other snapshot targets.
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Cannot rename across snapshots.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Only renaming a snapshot within its own snapdir is allowed.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // Renames may not cross quota roots (quota accounting could not
    // be transferred atomically).
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;
    req->set_old_inode(oldin.get());
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination name may or may not exist; -ENOENT is fine here.
    res = _lookup(todir, toname, 0, &otherin, perm);
    if (res != 0 && res != -ENOENT) {
      goto fail;
    } else if (res == 0) {
      req->set_other_inode(otherin.get());
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // Request never submitted; release it.
  put_request(req);
  return res;
}
11816
11817int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
11818 const char *newname, const UserPerm& perm)
11819{
11820 Mutex::Locker lock(client_lock);
11821
11822 vinodeno_t vparent = _get_vino(parent);
11823 vinodeno_t vnewparent = _get_vino(newparent);
11824
11825 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
11826 << vnewparent << " " << newname << dendl;
11827 tout(cct) << "ll_rename" << std::endl;
11828 tout(cct) << vparent.ino.val << std::endl;
11829 tout(cct) << name << std::endl;
11830 tout(cct) << vnewparent.ino.val << std::endl;
11831 tout(cct) << newname << std::endl;
11832
11833 if (!cct->_conf->fuse_default_permissions) {
11834 int r = may_delete(parent, name, perm);
11835 if (r < 0)
11836 return r;
11837 r = may_delete(newparent, newname, perm);
11838 if (r < 0 && r != -ENOENT)
11839 return r;
11840 }
11841
11842 return _rename(parent, name, newparent, newname, perm);
11843}
11844
/**
 * Create a hard link to @in named @newname under @dir.
 *
 * Returns 0 or a negative errno (-ENAMETOOLONG, -EROFS for snapshots,
 * -EDQUOT when the directory's file quota is exhausted).  On success the
 * (possibly updated) target inode is returned via @inp.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Neither the target nor the directory may live in a snapshot.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // filepath = new name relative to dir; filepath2 = existing inode.
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // Request never submitted; release it.
  put_request(req);
  return res;
}
11888
11889int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
11890 const UserPerm& perm)
11891{
11892 Mutex::Locker lock(client_lock);
11893
11894 vinodeno_t vino = _get_vino(in);
11895 vinodeno_t vnewparent = _get_vino(newparent);
11896
31f18b77 11897 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
11898 newname << dendl;
11899 tout(cct) << "ll_link" << std::endl;
11900 tout(cct) << vino.ino.val << std::endl;
11901 tout(cct) << vnewparent << std::endl;
11902 tout(cct) << newname << std::endl;
11903
11904 int r = 0;
11905 InodeRef target;
11906
11907 if (!cct->_conf->fuse_default_permissions) {
11908 if (S_ISDIR(in->mode))
11909 return -EPERM;
11910
11911 r = may_hardlink(in, perm);
11912 if (r < 0)
11913 return r;
11914
11915 r = may_create(newparent, perm);
11916 if (r < 0)
11917 return r;
11918 }
11919
11920 return _link(in, newparent, newname, perm, &target);
11921}
11922
11923int Client::ll_num_osds(void)
11924{
11925 Mutex::Locker lock(client_lock);
11926 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
11927}
11928
11929int Client::ll_osdaddr(int osd, uint32_t *addr)
11930{
11931 Mutex::Locker lock(client_lock);
11932 entity_addr_t g;
11933 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
11934 if (!o.exists(osd))
11935 return false;
11936 g = o.get_addr(osd);
11937 return true;
11938 });
11939 if (!exists)
11940 return -1;
11941 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
11942 *addr = ntohl(nb_addr);
11943 return 0;
11944}
11945uint32_t Client::ll_stripe_unit(Inode *in)
11946{
11947 Mutex::Locker lock(client_lock);
11948 return in->layout.stripe_unit;
11949}
11950
// Current snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — assumes the
// caller guarantees the inode is attached to a realm; verify against callers.
uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}
11956
11957int Client::ll_file_layout(Inode *in, file_layout_t *layout)
11958{
11959 Mutex::Locker lock(client_lock);
11960 *layout = in->layout;
11961 return 0;
11962}
11963
11964int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
11965{
11966 return ll_file_layout(fh->inode.get(), layout);
11967}
11968
11969/* Currently we cannot take advantage of redundancy in reads, since we
11970 would have to go through all possible placement groups (a
11971 potentially quite large number determined by a hash), and use CRUSH
11972 to calculate the appropriate set of OSDs for each placement group,
11973 then index into that. An array with one entry per OSD is much more
11974 tractable and works for demonstration purposes. */
11975
/**
 * Map file block @blockno (in units of the stripe unit) to the primary
 * OSD that currently serves it, using RADOS striping arithmetic plus the
 * current osdmap.  Returns the primary OSD id.
 */
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = ll_get_inodeno(in);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  // Standard RADOS striping: blocks round-robin across stripe_count
  // objects, and each object holds stripes_per_object stripe units.
  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      // Object -> PG -> acting set; the first (primary) OSD is returned.
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
12002
12003/* Return the offset of the block, internal to the object */
12004
12005uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12006{
12007 Mutex::Locker lock(client_lock);
12008 file_layout_t *layout=&(in->layout);
12009 uint32_t object_size = layout->object_size;
12010 uint32_t su = layout->stripe_unit;
12011 uint64_t stripes_per_object = object_size / su;
12012
12013 return (blockno % stripes_per_object) * su;
12014}
12015
12016int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12017 const UserPerm& perms)
12018{
12019 Mutex::Locker lock(client_lock);
12020
12021 vinodeno_t vino = _get_vino(in);
12022
12023 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12024 tout(cct) << "ll_opendir" << std::endl;
12025 tout(cct) << vino.ino.val << std::endl;
12026
12027 if (!cct->_conf->fuse_default_permissions) {
12028 int r = may_open(in, flags, perms);
12029 if (r < 0)
12030 return r;
12031 }
12032
12033 int r = _opendir(in, dirpp, perms);
12034 tout(cct) << (unsigned long)*dirpp << std::endl;
12035
12036 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12037 << dendl;
12038 return r;
12039}
12040
12041int Client::ll_releasedir(dir_result_t *dirp)
12042{
12043 Mutex::Locker lock(client_lock);
12044 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12045 tout(cct) << "ll_releasedir" << std::endl;
12046 tout(cct) << (unsigned long)dirp << std::endl;
12047 _closedir(dirp);
12048 return 0;
12049}
12050
12051int Client::ll_fsyncdir(dir_result_t *dirp)
12052{
12053 Mutex::Locker lock(client_lock);
12054 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12055 tout(cct) << "ll_fsyncdir" << std::endl;
12056 tout(cct) << (unsigned long)dirp << std::endl;
12057
12058 return _fsync(dirp->inode.get(), false);
12059}
12060
12061int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12062{
12063 assert(!(flags & O_CREAT));
12064
12065 Mutex::Locker lock(client_lock);
12066
12067 vinodeno_t vino = _get_vino(in);
12068
12069 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12070 tout(cct) << "ll_open" << std::endl;
12071 tout(cct) << vino.ino.val << std::endl;
12072 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12073
12074 int r;
12075 if (!cct->_conf->fuse_default_permissions) {
12076 r = may_open(in, flags, perms);
12077 if (r < 0)
12078 goto out;
12079 }
12080
12081 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12082
12083 out:
12084 Fh *fhptr = fhp ? *fhp : NULL;
12085 if (fhptr) {
12086 ll_unclosed_fh_set.insert(fhptr);
12087 }
12088 tout(cct) << (unsigned long)fhptr << std::endl;
12089 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12090 " = " << r << " (" << fhptr << ")" << dendl;
12091 return r;
12092}
12093
/**
 * Shared implementation for ll_create/ll_createx: look up @name under
 * @parent, create it if absent and O_CREAT is given, and open it.
 *
 * On success *in holds the inode and *fhp the open handle (which is
 * registered in ll_unclosed_fh_set).  Returns 0 or a negative errno;
 * -EEXIST when O_CREAT|O_EXCL and the name already exists.
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // Existing name + exclusive create is an error.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create both creates and opens; *fhp may be set on return.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: check open permission and open it ourselves.
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any open handle so shutdown can detect ll handle leaks.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12175
12176int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12177 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12178 const UserPerm& perms)
12179{
12180 Mutex::Locker lock(client_lock);
12181 InodeRef in;
12182
12183 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12184 fhp, perms);
12185 if (r >= 0) {
12186 assert(in);
12187
12188 // passing an Inode in outp requires an additional ref
12189 if (outp) {
12190 _ll_get(in.get());
12191 *outp = in.get();
12192 }
12193 fill_stat(in, attr);
12194 } else {
12195 attr->st_ino = 0;
12196 }
12197
12198 return r;
12199}
12200
12201int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12202 int oflags, Inode **outp, Fh **fhp,
12203 struct ceph_statx *stx, unsigned want, unsigned lflags,
12204 const UserPerm& perms)
12205{
12206 unsigned caps = statx_to_mask(lflags, want);
12207 Mutex::Locker lock(client_lock);
12208 InodeRef in;
12209
12210
12211 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12212 if (r >= 0) {
12213 assert(in);
12214
12215 // passing an Inode in outp requires an additional ref
12216 if (outp) {
12217 _ll_get(in.get());
12218 *outp = in.get();
12219 }
12220 fill_statx(in, caps, stx);
12221 } else {
12222 stx->stx_ino = 0;
12223 stx->stx_mask = 0;
12224 }
12225
12226 return r;
12227}
12228
12229loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12230{
12231 Mutex::Locker lock(client_lock);
12232 tout(cct) << "ll_lseek" << std::endl;
12233 tout(cct) << offset << std::endl;
12234 tout(cct) << whence << std::endl;
12235
12236 return _lseek(fh, offset, whence);
12237}
12238
12239int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12240{
12241 Mutex::Locker lock(client_lock);
12242 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12243 tout(cct) << "ll_read" << std::endl;
12244 tout(cct) << (unsigned long)fh << std::endl;
12245 tout(cct) << off << std::endl;
12246 tout(cct) << len << std::endl;
12247
12248 return _read(fh, off, len, bl);
12249}
12250
/**
 * Read @length bytes at @offset from RADOS object @blockid of this file
 * directly via the objecter (bypassing the page cache and caps).
 * Returns the number of bytes read or a negative errno.
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vino = ll_get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop client_lock while blocking on the OSD reply so other client
  // operations (and the reply itself) can make progress.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12283
12284/* It appears that the OSD doesn't return success unless the entire
12285 buffer was written, return the write length on success. */
12286
/**
 * Write @length bytes from @buf to RADOS object @blockid of this file
 * directly via the objecter.  Returns @length on success (the OSD only
 * acks complete writes) or a negative errno.
 *
 * Note: the unstable-write (barrier) path below is currently disabled
 * by the `if (true || sync)` condition; all writes wait for commit.
 */
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			       barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  // Release client_lock before blocking; C_SafeCond signals under flock.
  client_lock.Unlock();
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12354
/**
 * Commit previously barriered block writes in [offset, offset+length).
 * The barrier machinery is currently compiled out (see the disabled
 * unstable-write path in ll_write_block), so this is a no-op returning 0.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = ll_get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12380
12381int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12382{
12383 Mutex::Locker lock(client_lock);
12384 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12385 "~" << len << dendl;
12386 tout(cct) << "ll_write" << std::endl;
12387 tout(cct) << (unsigned long)fh << std::endl;
12388 tout(cct) << off << std::endl;
12389 tout(cct) << len << std::endl;
12390
12391 int r = _write(fh, off, len, data, NULL, 0);
12392 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12393 << dendl;
12394 return r;
12395}
12396
12397int Client::ll_flush(Fh *fh)
12398{
12399 Mutex::Locker lock(client_lock);
12400 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12401 tout(cct) << "ll_flush" << std::endl;
12402 tout(cct) << (unsigned long)fh << std::endl;
12403
12404 return _flush(fh);
12405}
12406
12407int Client::ll_fsync(Fh *fh, bool syncdataonly)
12408{
12409 Mutex::Locker lock(client_lock);
12410 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12411 tout(cct) << "ll_fsync" << std::endl;
12412 tout(cct) << (unsigned long)fh << std::endl;
12413
12414 int r = _fsync(fh, syncdataonly);
12415 if (r) {
12416 // If we're returning an error, clear it from the FH
12417 fh->take_async_err();
12418 }
12419 return r;
12420}
12421
#ifdef FALLOC_FL_PUNCH_HOLE

/**
 * Implement fallocate(2) semantics on @fh: either punch a hole
 * (FALLOC_FL_PUNCH_HOLE, which requires FALLOC_FL_KEEP_SIZE) or extend
 * the apparent file size.  Returns 0 or a negative errno.
 *
 * Caller must hold client_lock; it is dropped/retaken around blocking
 * waits below.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Only KEEP_SIZE and PUNCH_HOLE are supported.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // A full pool can still accept hole punching (it frees space).
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Growing the file must respect byte quotas.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  // Need FILE_WR caps; FILE_BUFFER lets us mutate inline data locally.
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by
      // rebuilding the inline buffer with a zeroed middle section.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Otherwise migrate any inline data out to RADOS first, then
      // zero the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Drop client_lock while waiting for the zero to commit.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate: just extend the apparent size; no data is written.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  // If we kicked off an inline-data migration, wait for it here.
  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; that's fine.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platforms without FALLOC_FL_PUNCH_HOLE get no fallocate support.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
12566
12567
12568int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
12569{
12570 Mutex::Locker lock(client_lock);
12571 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
12572 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
12573 tout(cct) << (unsigned long)fh << std::endl;
12574
12575 return _fallocate(fh, mode, offset, length);
12576}
12577
12578int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
12579{
12580 Mutex::Locker lock(client_lock);
12581 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
12582
12583 Fh *fh = get_filehandle(fd);
12584 if (!fh)
12585 return -EBADF;
12586#if defined(__linux__) && defined(O_PATH)
12587 if (fh->flags & O_PATH)
12588 return -EBADF;
12589#endif
12590 return _fallocate(fh, mode, offset, length);
12591}
12592
12593int Client::ll_release(Fh *fh)
12594{
12595 Mutex::Locker lock(client_lock);
12596 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
12597 dendl;
12598 tout(cct) << "ll_release (fh)" << std::endl;
12599 tout(cct) << (unsigned long)fh << std::endl;
12600
12601 if (ll_unclosed_fh_set.count(fh))
12602 ll_unclosed_fh_set.erase(fh);
12603 return _release_fh(fh);
12604}
12605
12606int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
12607{
12608 Mutex::Locker lock(client_lock);
12609
12610 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
12611 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
12612
12613 return _getlk(fh, fl, owner);
12614}
12615
12616int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
12617{
12618 Mutex::Locker lock(client_lock);
12619
12620 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
12621 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
12622
12623 return _setlk(fh, fl, owner, sleep);
12624}
12625
12626int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
12627{
12628 Mutex::Locker lock(client_lock);
12629
12630 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
12631 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
12632
12633 return _flock(fh, cmd, owner);
12634}
12635
// Finisher context that interrupts an in-flight SETFILELOCK request on
// behalf of ll_interrupt().  Holds its own reference to the request so
// it stays valid until finish() runs on the finisher thread.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // extra ref released in finish()
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only file-lock requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
12651
12652void Client::ll_interrupt(void *d)
12653{
12654 MetaRequest *req = static_cast<MetaRequest*>(d);
12655 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
12656 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
12657 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
12658}
12659
12660// =========================================
12661// layout
12662
12663// expose file layouts
12664
12665int Client::describe_layout(const char *relpath, file_layout_t *lp,
12666 const UserPerm& perms)
12667{
12668 Mutex::Locker lock(client_lock);
12669
12670 filepath path(relpath);
12671 InodeRef in;
12672 int r = path_walk(path, &in, perms);
12673 if (r < 0)
12674 return r;
12675
12676 *lp = in->layout;
12677
12678 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
12679 return 0;
12680}
12681
12682int Client::fdescribe_layout(int fd, file_layout_t *lp)
12683{
12684 Mutex::Locker lock(client_lock);
12685
12686 Fh *f = get_filehandle(fd);
12687 if (!f)
12688 return -EBADF;
12689 Inode *in = f->inode.get();
12690
12691 *lp = in->layout;
12692
12693 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
12694 return 0;
12695}
12696
12697
12698// expose osdmap
12699
12700int64_t Client::get_pool_id(const char *pool_name)
12701{
12702 Mutex::Locker lock(client_lock);
12703 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
12704 pool_name);
12705}
12706
12707string Client::get_pool_name(int64_t pool)
12708{
12709 Mutex::Locker lock(client_lock);
12710 return objecter->with_osdmap([pool](const OSDMap& o) {
12711 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
12712 });
12713}
12714
12715int Client::get_pool_replication(int64_t pool)
12716{
12717 Mutex::Locker lock(client_lock);
12718 return objecter->with_osdmap([pool](const OSDMap& o) {
12719 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
12720 });
12721}
12722
/**
 * For file offset @off of fd @fd, report the acting OSD set of the
 * containing object in @osds and, via @len, how many bytes remain in
 * the current stripe unit.  Returns 0, -EBADF, or -EINVAL when no OSDs
 * are acting.
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte range at @off to exactly one object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
12765
12766int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
12767{
12768 Mutex::Locker lock(client_lock);
12769 if (id < 0)
12770 return -EINVAL;
12771 return objecter->with_osdmap([&](const OSDMap& o) {
12772 return o.crush->get_full_location_ordered(id, path);
12773 });
12774}
12775
12776int Client::get_file_stripe_address(int fd, loff_t offset,
12777 vector<entity_addr_t>& address)
12778{
12779 Mutex::Locker lock(client_lock);
12780
12781 Fh *f = get_filehandle(fd);
12782 if (!f)
12783 return -EBADF;
12784 Inode *in = f->inode.get();
12785
12786 // which object?
12787 vector<ObjectExtent> extents;
12788 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
12789 in->truncate_size, extents);
12790 assert(extents.size() == 1);
12791
12792 // now we have the object and its 'layout'
12793 return objecter->with_osdmap([&](const OSDMap& o) {
12794 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
12795 vector<int> osds;
12796 o.pg_to_acting_osds(pg, osds);
12797 if (osds.empty())
12798 return -EINVAL;
12799 for (unsigned i = 0; i < osds.size(); i++) {
12800 entity_addr_t addr = o.get_addr(osds[i]);
12801 address.push_back(addr);
12802 }
12803 return 0;
12804 });
12805}
12806
12807int Client::get_osd_addr(int osd, entity_addr_t& addr)
12808{
12809 Mutex::Locker lock(client_lock);
12810 return objecter->with_osdmap([&](const OSDMap& o) {
12811 if (!o.exists(osd))
12812 return -ENOENT;
12813
12814 addr = o.get_addr(osd);
12815 return 0;
12816 });
12817}
12818
12819int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
12820 loff_t length, loff_t offset)
12821{
12822 Mutex::Locker lock(client_lock);
12823
12824 Fh *f = get_filehandle(fd);
12825 if (!f)
12826 return -EBADF;
12827 Inode *in = f->inode.get();
12828
12829 // map to a list of extents
12830 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
12831
12832 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
12833 return 0;
12834}
12835
12836
12837/*
12838 * find an osd with the same ip. -1 if none.
12839 */
// Return the id of an OSD sharing this client's IP, or -1 if none.
// The result is cached per OSD map epoch: we only re-scan the map when
// its epoch differs from the one recorded at the last lookup.
int Client::get_local_osd()
{
  Mutex::Locker lock(client_lock);
  objecter->with_osdmap([this](const OSDMap& o) {
    if (o.get_epoch() != local_osd_epoch) {
      local_osd = o.find_osd_on_ip(messenger->get_myaddr());
      local_osd_epoch = o.get_epoch();
    }
  });
  return local_osd;
}
12851
12852
12853
12854
12855
12856
12857// ===============================
12858
// Messenger callback: a connection to a peer was established.
// Nothing to do here beyond logging the peer address.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
12863
// Messenger callback: our connection to a peer was reset.
// We log it and return false (no special handling here).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
12869
// Messenger callback: the remote peer on 'con' dropped its session
// state (remote reset).  For MDS connections we map the peer address
// back to the corresponding MetaSession and react according to the
// session's state (closing/opening/open); other peer types are ignored.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      // mds >= 0 implies a session was found (MDS_RANK_NONE is negative),
      // so 's' is non-NULL below
      if (mds >= 0) {
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // preserve the contexts waiting for the open, close the dead
	    // session, then reattach the waiters to a fresh session
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    // either close for a full reconnect or just mark the session
	    // stale, depending on configuration
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // nothing to recover in these states
	  break;
	}
      }
    }
    break;
  }
}
12929
// Messenger callback: the peer refused our connection attempt.
// We log it and return false (no special handling here).
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
12935
// Messenger callback: supply an authorizer for an outgoing connection
// to a daemon of 'dest_type'.  For monitors we return without building
// one; for everything else we ask the monclient to construct it.
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
12943
// Find the quota root for 'in': walk up the directory hierarchy and
// return the nearest ancestor above 'in' itself with quotas enabled,
// or root_ancestor if none is found.  When a parent link is not known
// locally we issue a LOOKUPNAME request to the MDS; after a successful
// remote lookup we restart the walk from 'in'.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // stop at the first ancestor (not 'in' itself) that has quota set
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to resolve the parent from locally cached dentries first.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // dentry lease is still valid -> trust this linkage
	  parent_in = dn->dir->parent_inode;
	} else {
	  Inode *diri = dn->dir->parent_inode;
	  // no lease, but a shared cap on the parent dir with a matching
	  // shared_gen also means the dentry is current
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      // parent of a tree root is tracked separately in root_parents
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // Parent unknown locally: ask the MDS who the parent is.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // refresh 'now' so lease checks on the next pass use current time
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13009
13010/**
13011 * Traverse quota ancestors of the Inode, return true
13012 * if any of them passes the passed function
13013 */
13014bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13015 std::function<bool (const Inode &in)> test)
13016{
13017 while (true) {
13018 assert(in != NULL);
13019 if (test(*in)) {
13020 return true;
13021 }
13022
13023 if (in == root_ancestor) {
13024 // We're done traversing, drop out
13025 return false;
13026 } else {
13027 // Continue up the tree
13028 in = get_quota_root(in, perms);
13029 }
13030 }
13031
13032 return false;
13033}
13034
13035bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13036{
13037 return check_quota_condition(in, perms,
13038 [](const Inode &in) {
13039 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13040 });
13041}
13042
13043bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13044 const UserPerm& perms)
13045{
13046 return check_quota_condition(in, perms,
13047 [&new_bytes](const Inode &in) {
13048 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13049 > in.quota.max_bytes;
13050 });
13051}
13052
13053bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13054{
13055 return check_quota_condition(in, perms,
13056 [](const Inode &in) {
13057 if (in.quota.max_bytes) {
13058 if (in.rstat.rbytes >= in.quota.max_bytes) {
13059 return true;
13060 }
13061
13062 assert(in.size >= in.reported_size);
13063 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13064 const uint64_t size = in.size - in.reported_size;
13065 return (space >> 4) < size;
13066 } else {
13067 return false;
13068 }
13069 });
13070}
13071
// Flags cached per (pool id, namespace) in pool_perms, maintained by
// check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // probe finished; READ/WRITE bits are authoritative
  POOL_CHECKING = 2,  // probe in flight; other callers wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13078
// Verify that this client may read/write objects in the inode's data
// pool (+namespace).  Results are cached in pool_perms keyed by
// (pool id, namespace); concurrent callers for the same key block while
// a probe is in flight.  'need' is a CEPH_CAP_FILE_RD/WR mask.
// Returns 0 if permitted, -EPERM if denied, -EIO if the probe failed
// for an indeterminate reason.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; wait out any probe another thread has in flight.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the probe in flight so other callers wait instead of racing
    pool_perms[perm_key] = POOL_CHECKING;

    // probe against the file's first object
    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: a stat on the first object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: an exclusive create (-EEXIST is a success signal)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop client_lock while blocking on the OSD round trips
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT still proves read permission (the object simply doesn't exist)
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST still proves write permission (the object already exists)
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // cache the result and wake anyone waiting on the probe
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13181
13182int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13183{
13184 if (acl_type == POSIX_ACL) {
13185 if (in->xattrs.count(ACL_EA_ACCESS)) {
13186 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13187
13188 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13189 }
13190 }
13191 return -EAGAIN;
13192}
13193
// Rewrite the inode's cached POSIX access ACL to reflect a chmod to
// 'mode'.  Fetches xattrs from the MDS if never fetched before, then
// updates the ACL_EA_ACCESS xattr in place.  Returns 0 on success
// (including when no access ACL exists) or a negative errno.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // force an xattr fetch only when we have no cached xattr version yet
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy of the ACL buffer before mutating it
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // no access ACL cached: nothing to rewrite
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13219
// Compute inherited POSIX ACLs for a new inode being created under
// 'dir'.  Applies the directory's default ACL to *mode and encodes any
// resulting xattrs (access ACL; plus the default ACL for directories)
// into xattrs_bl for the create request.  When no default ACL exists,
// the registered umask callback (if any) is applied to *mode instead.
// Returns the number of xattrs encoded (>= 0) or a negative errno.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // force an xattr fetch only when the directory has none cached yet
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // work on a private copy; posix_acl_inherit_mode mutates the buffer
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0 presumably means the child needs its own access ACL
	// unless the ACL is fully representable in mode bits -- see
	// posix_acl_inherit_mode/posix_acl_equiv_mode
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // new directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: fall back to the umask callback, if registered
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13267
// Set a global objecter flag applied to all subsequent OSD operations.
// Only 0 or CEPH_OSD_FLAG_LOCALIZE_READS is accepted here.
void Client::set_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == 0 ||
	 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
13275
// Clear a global objecter op flag previously set via set_filer_flags().
// Only CEPH_OSD_FLAG_LOCALIZE_READS may be cleared.
void Client::clear_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
13282
13283/**
13284 * This is included in cap release messages, to cause
13285 * the MDS to wait until this OSD map epoch. It is necessary
13286 * in corner cases where we cancel RADOS ops, so that
13287 * nobody else tries to do IO to the same objects in
13288 * the same epoch as the cancelled ops.
13289 */
// Record the OSD map epoch to include in cap release messages
// (see the block comment above for why the MDS needs this).
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
13295
// Config observer interface: the options we react to in
// handle_conf_change().  The array is NULL-terminated.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    NULL
  };
  return keys;
}
13306
13307void Client::handle_conf_change(const struct md_config_t *conf,
13308 const std::set <std::string> &changed)
13309{
13310 Mutex::Locker lock(client_lock);
13311
13312 if (changed.count("client_cache_size") ||
13313 changed.count("client_cache_mid")) {
13314 lru.lru_set_max(cct->_conf->client_cache_size);
13315 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13316 }
13317 if (changed.count("client_acl_type")) {
13318 acl_type = NO_ACL;
13319 if (cct->_conf->client_acl_type == "posix_acl")
13320 acl_type = POSIX_ACL;
13321 }
13322}
13323
// Populate 'perms' with the supplementary group list for its uid/gid.
// _getgrouplist allocates the sgids array; init_gids takes it over
// (presumably owning it from here on -- verify against UserPerm).
void Client::init_groups(UserPerm *perms)
{
  gid_t *sgids;
  int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
  perms->init_gids(sgids, count);
}
13330
// boost::intrusive_ptr support: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13335
// boost::intrusive_ptr support: drop a reference, delegating the
// release to the owning Client's put_inode().
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13340
13341mds_rank_t Client::_get_random_up_mds() const
13342{
13343 assert(client_lock.is_locked_by_me());
13344
13345 std::set<mds_rank_t> up;
13346 mdsmap->get_up_mds_set(up);
13347
13348 if (up.empty())
13349 return MDS_RANK_NONE;
13350 std::set<mds_rank_t>::const_iterator p = up.begin();
13351 for (int n = rand() % up.size(); n; n--)
13352 ++p;
13353 return *p;
13354}
13355
13356
// A Client that constructs and owns its own Objecter (the base Client
// can alternatively be handed an externally managed one).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13363
StandaloneClient::~StandaloneClient()
{
  // we created the objecter in our constructor, so we delete it here
  delete objecter;
  objecter = nullptr;
}
13369
// Bring up a standalone client: timer, object cacher, objecter, and
// the monitor session.  If monclient init fails, the already-started
// subsystems are unwound before returning the error.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  // register message dispatchers with the messenger
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13400
// Shut down the base Client first, then the subsystems this
// standalone variant is responsible for.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}
13407