// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/utsname.h>
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::lock_guard l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions")
      m_client->dump_mds_sessions(f);
    else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
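
/*
 * Illustrative usage (not part of the build): these hooks back the
 * client's admin socket commands, which can be invoked against a live
 * client through its asok file, e.g.
 *
 *   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_sessions
 *
 * The asok path shown is only an example; the real path depends on the
 * admin_socket configuration of the mount.
 */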


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
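
/*
 * A minimal sketch (illustrative, not part of the build) of the
 * interval_set allocation pattern used above: free ids are stored as
 * ranges, an id is carved out of a suitable range on allocation, and
 * released ids are merged back in.
 *
 *   #include "include/interval_set.h"
 *
 *   interval_set<ino_t> free_inos;
 *   free_inos.insert(1024, 1000);   // ids [1024, 2024) are free
 *   ino_t ino = free_inos.range_start();
 *   free_inos.erase(ino);           // allocate one id
 *   free_inos.insert(ino);          // release it again
 */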

/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are all the same, so we distinguish the mount points by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10 bits (0x3ff) of the "root inodes".
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}
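
/*
 * Illustrative note (not part of the build): the ObjectCacher sizing
 * above is taken directly from the client_oc_* options, so the data
 * cache can be tuned per mount, e.g. in ceph.conf:
 *
 *   [client]
 *     client_oc_size = 209715200        ; bytes of cached data
 *     client_oc_max_dirty = 104857600   ; bytes of dirty data
 *     client_oc_target_dirty = 8388608  ; start flushing at this much
 *
 * The values shown are examples only, not recommendations.
 */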


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}

int Client::init()
{
  _pre_init();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}

void Client::_finish_init()
{
  {
    std::lock_guard l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  std::lock_guard l{client_lock};
  initialized = true;
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
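
/*
 * Illustrative note (not part of the build): the dentry LRU trimmed
 * above is bounded by the client_cache_size option (a dentry count,
 * not bytes), with client_cache_mid setting the LRU midpoint, e.g.
 *
 *   [client]
 *     client_cache_size = 16384
 *
 * The value shown is an example only.
 */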

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}
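
/*
 * A minimal sketch (illustrative, not part of the build) of the
 * erase-while-iterating idiom used by the two helpers above: the
 * post-increment hands erase() the old iterator while the loop
 * variable has already moved on, so no invalidated iterator is
 * ever dereferenced.
 *
 *   std::map<int, int> m = {{1, 10}, {2, 20}};
 *   for (auto p = m.begin(); p != m.end(); )
 *     if (p->second == 20)
 *       m.erase(p++);  // erase old position, keep iterating
 *     else
 *       ++p;
 */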

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
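
/*
 * Illustrative note (not part of the build): a dentry lease lets the
 * client trust a cached name -> inode mapping without re-asking the
 * MDS. The TTL computed above is just the request send time plus the
 * MDS-granted duration:
 *
 *   utime_t ttl = from;                          // when the request was sent
 *   ttl += (float)dlease->duration_ms / 1000.0;
 *
 * so a 30000 ms lease granted at time t covers lookups until t + 30 s,
 * provided the session's cap_gen has not moved on.
 */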


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
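
/*
 * Illustrative note (not part of the build): dir_result_t::make_fpos
 * packs two pieces into the 64-bit readdir offsets used above: a high
 * part (the fragment, or the dentry-name hash when the MDS returns
 * entries in hash order) and a low per-fragment offset, plus a marker
 * distinguishing hash-order offsets. Offsets 0 and 1 are reserved for
 * "." and "..", which is why fresh fragments start at next_offset = 2.
 */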

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
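
/*
 * Illustrative summary (not part of the build) of the target-MDS
 * choice above, in priority order:
 *   1. an explicit resend_mds set by a forward or retry;
 *   2. the dirfrag map, when the dentry name hashes to a fragment
 *      whose authority is already known;
 *   3. the MDS holding the inode's auth cap, or any cap we hold;
 *   4. a random up MDS as a last resort.
 */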


void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||           // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
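
/*
 * A minimal caller sketch (illustrative, not part of the build),
 * following the pattern used throughout this file, e.g. by getattr:
 *
 *   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
 *   filepath path;
 *   in->make_nosnap_relative_path(path);
 *   req->set_filepath(path);
 *   req->set_inode(in);
 *   req->head.args.getattr.mask = mask;
 *   InodeRef target;
 *   int r = make_request(req, perms, &target);
 *
 * make_request() consumes the caller's request reference and blocks
 * until a reply, forward, or abort wakes it.
 */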

void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
        oldest_tid = 0;
        break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
        oldest_tid = p->first;
        break;
      }
      ++p;
    }
  }
  put_request(req);
}

void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}

int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
1900
1901 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1902 mds_rank_t mds, int drop, int unless)
1903 {
1904 ldout(cct, 20) << __func__ << " enter(dn:"
1905 << dn << ")" << dendl;
1906 int released = 0;
1907 if (dn->dir)
1908 released = encode_inode_release(dn->dir->parent_inode, req,
1909 mds, drop, unless, 1);
1910 if (released && dn->lease_mds == mds) {
1911 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1912 auto& rel = req->cap_releases.back();
1913 rel.item.dname_len = dn->name.length();
1914 rel.item.dname_seq = dn->lease_seq;
1915 rel.dname = dn->name;
1916 }
1917 ldout(cct, 25) << __func__ << " exit(dn:"
1918 << dn << ")" << dendl;
1919 }
1920
1921
1922 /*
1923 * This requires the MClientRequest *request member to be set.
1924 * It will error out horribly without one.
1925 * Additionally, if you set any *drop member, you'd better have
1926 * set the corresponding dentry!
1927 */
1928 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1929 {
1930 ldout(cct, 20) << __func__ << " enter (req: "
1931 << req << ", mds: " << mds << ")" << dendl;
1932 if (req->inode_drop && req->inode())
1933 encode_inode_release(req->inode(), req,
1934 mds, req->inode_drop,
1935 req->inode_unless);
1936
1937 if (req->old_inode_drop && req->old_inode())
1938 encode_inode_release(req->old_inode(), req,
1939 mds, req->old_inode_drop,
1940 req->old_inode_unless);
1941 if (req->other_inode_drop && req->other_inode())
1942 encode_inode_release(req->other_inode(), req,
1943 mds, req->other_inode_drop,
1944 req->other_inode_unless);
1945
1946 if (req->dentry_drop && req->dentry())
1947 encode_dentry_release(req->dentry(), req,
1948 mds, req->dentry_drop,
1949 req->dentry_unless);
1950
1951 if (req->old_dentry_drop && req->old_dentry())
1952 encode_dentry_release(req->old_dentry(), req,
1953 mds, req->old_dentry_drop,
1954 req->old_dentry_unless);
1955 ldout(cct, 25) << __func__ << " exit (req: "
1956 << req << ", mds: " << mds << ")" << dendl;
1957 }
1958
1959 bool Client::have_open_session(mds_rank_t mds)
1960 {
1961 const auto &it = mds_sessions.find(mds);
1962 return it != mds_sessions.end() &&
1963 (it->second.state == MetaSession::STATE_OPEN ||
1964 it->second.state == MetaSession::STATE_STALE);
1965 }
1966
1967 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1968 {
1969 const auto &it = mds_sessions.find(mds);
1970 if (it == mds_sessions.end() || it->second.con != con) {
1971 return NULL;
1972 } else {
1973 return &it->second;
1974 }
1975 }
1976
1977 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1978 {
1979 auto it = mds_sessions.find(mds);
1980 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1981 }
1982
1983 /**
1984 * Populate a map of strings with client-identifying metadata,
1985 * such as the hostname. Call this once at initialization.
1986 */
1987 void Client::populate_metadata(const std::string &mount_root)
1988 {
1989 // Hostname
1990 struct utsname u;
1991 int r = uname(&u);
1992 if (r >= 0) {
1993 metadata["hostname"] = u.nodename;
1994 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1995 } else {
1996 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(errno) << ")" << dendl; // uname() returns -1 and reports the error in errno
1997 }
1998
1999 metadata["pid"] = stringify(getpid());
2000
2001 // Ceph entity id (the '0' in "client.0")
2002 metadata["entity_id"] = cct->_conf->name.get_id();
2003
2004 // Our mount position
2005 if (!mount_root.empty()) {
2006 metadata["root"] = mount_root;
2007 }
2008
2009 // Ceph version
2010 metadata["ceph_version"] = pretty_version_to_str();
2011 metadata["ceph_sha1"] = git_version_to_str();
2012
2013 // Apply any metadata from the user's configured overrides
2014 std::vector<std::string> tokens;
2015 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2016 for (const auto &i : tokens) {
2017 auto eqpos = i.find("=");
2018 // Throw out anything that isn't of the form "<str>=<str>"
2019 if (eqpos == 0 || eqpos == std::string::npos || eqpos + 1 == i.size()) {
2020 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2021 continue;
2022 }
2023 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2024 }
2025 }
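/*
 * For reference, client_metadata accepts a comma-separated list of
 * "key=value" pairs, e.g. (hypothetical values):
 *
 *   [client]
 *   client_metadata = "rack=r1,datacenter=dc1"
 *
 * which yields metadata["rack"] = "r1" and metadata["datacenter"] = "dc1";
 * malformed entries such as "=v", "kv" or "k=" are skipped by the checks
 * above.
 */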
2026
2027 /**
2028 * Optionally add or override client metadata fields.
2029 */
2030 void Client::update_metadata(std::string const &k, std::string const &v)
2031 {
2032 std::lock_guard l(client_lock);
2033 ceph_assert(initialized);
2034
2035 auto it = metadata.find(k);
2036 if (it != metadata.end()) {
2037 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2038 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2039 }
2040
2041 metadata[k] = v;
2042 }
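/*
 * A minimal usage sketch (hypothetical keys and values); update_metadata()
 * takes client_lock itself, so callers need not hold it:
 *
 *   client->update_metadata("mount_point", "/mnt/cephfs");
 *   client->update_metadata("kernel_version", "5.4.0");
 */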
2043
2044 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2045 {
2046 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2047 auto addrs = mdsmap->get_addrs(mds);
2048 auto em = mds_sessions.emplace(std::piecewise_construct,
2049 std::forward_as_tuple(mds),
2050 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2051 ceph_assert(em.second); /* not already present */
2052 MetaSession *session = &em.first->second;
2053
2054 // Maybe skip sending a request to open if this MDS daemon
2055 // has previously sent us a REJECT.
2056 if (rejected_by_mds.count(mds)) {
2057 if (rejected_by_mds[mds] == session->addrs) {
2058 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
2059 "because we were rejected" << dendl;
2060 return session;
2061 } else {
2062 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
2063 "rejected us, trying with new inst" << dendl;
2064 rejected_by_mds.erase(mds);
2065 }
2066 }
2067
2068 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2069 m->metadata = metadata;
2070 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2071 session->con->send_message2(std::move(m));
2072 return session;
2073 }
2074
2075 void Client::_close_mds_session(MetaSession *s)
2076 {
2077 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2078 s->state = MetaSession::STATE_CLOSING;
2079 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2080 }
2081
2082 void Client::_closed_mds_session(MetaSession *s)
2083 {
2084 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2085 s->state = MetaSession::STATE_CLOSED;
2086 s->con->mark_down();
2087 signal_context_list(s->waiting_for_open);
2088 mount_cond.notify_all();
2089 remove_session_caps(s);
2090 kick_requests_closed(s);
2091 mds_sessions.erase(s->mds_num);
2092 }
2093
2094 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2095 {
2096 mds_rank_t from = mds_rank_t(m->get_source().num());
2097 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2098
2099 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2100 if (!session) {
2101 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2102 return;
2103 }
2104
2105 switch (m->get_op()) {
2106 case CEPH_SESSION_OPEN:
2107 {
2108 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2109 missing_features -= m->supported_features;
2110 if (!missing_features.empty()) {
2111 lderr(cct) << "mds." << from << " lacks required features '"
2112 << missing_features << "', closing session " << dendl;
2113 rejected_by_mds[session->mds_num] = session->addrs;
2114 _close_mds_session(session);
2115 _closed_mds_session(session);
2116 break;
2117 }
2118 session->mds_features = std::move(m->supported_features);
2119
2120 renew_caps(session);
2121 session->state = MetaSession::STATE_OPEN;
2122 if (unmounting)
2123 mount_cond.notify_all();
2124 else
2125 connect_mds_targets(from);
2126 signal_context_list(session->waiting_for_open);
2127 break;
2128 }
2129
2130 case CEPH_SESSION_CLOSE:
2131 _closed_mds_session(session);
2132 break;
2133
2134 case CEPH_SESSION_RENEWCAPS:
2135 if (session->cap_renew_seq == m->get_seq()) {
2136 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2137 session->cap_ttl =
2138 session->last_cap_renew_request + mdsmap->get_session_timeout();
2139 if (was_stale)
2140 wake_up_session_caps(session, false);
2141 }
2142 break;
2143
2144 case CEPH_SESSION_STALE:
2145 // invalidate session caps/leases
2146 session->cap_gen++;
2147 session->cap_ttl = ceph_clock_now();
2148 session->cap_ttl -= 1;
2149 renew_caps(session);
2150 break;
2151
2152 case CEPH_SESSION_RECALL_STATE:
2153 trim_caps(session, m->get_max_caps());
2154 break;
2155
2156 case CEPH_SESSION_FLUSHMSG:
2157 /* flush cap release */
2158 if (auto& m = session->release; m) {
2159 session->con->send_message2(std::move(m));
2160 }
2161 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2162 break;
2163
2164 case CEPH_SESSION_FORCE_RO:
2165 force_session_readonly(session);
2166 break;
2167
2168 case CEPH_SESSION_REJECT:
2169 {
2170 std::string_view error_str;
2171 auto it = m->metadata.find("error_string");
2172 if (it != m->metadata.end())
2173 error_str = it->second;
2174 else
2175 error_str = "unknown error";
2176 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2177
2178 rejected_by_mds[session->mds_num] = session->addrs;
2179 _closed_mds_session(session);
2180 }
2181 break;
2182
2183 default:
2184 ceph_abort();
2185 }
2186 }
2187
2188 bool Client::_any_stale_sessions() const
2189 {
2190 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2191
2192 for (const auto &p : mds_sessions) {
2193 if (p.second.state == MetaSession::STATE_STALE) {
2194 return true;
2195 }
2196 }
2197
2198 return false;
2199 }
2200
2201 void Client::_kick_stale_sessions()
2202 {
2203 ldout(cct, 1) << __func__ << dendl;
2204
2205 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2206 MetaSession &s = it->second;
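// advance the iterator first: _closed_mds_session() erases this
// entry from mds_sessions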
2207 ++it;
2208 if (s.state == MetaSession::STATE_STALE)
2209 _closed_mds_session(&s);
2210 }
2211 }
2212
2213 void Client::send_request(MetaRequest *request, MetaSession *session,
2214 bool drop_cap_releases)
2215 {
2216 // make the request
2217 mds_rank_t mds = session->mds_num;
2218 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2219 << " for mds." << mds << dendl;
2220 auto r = build_client_request(request);
2221 if (request->dentry()) {
2222 r->set_dentry_wanted();
2223 }
2224 if (request->got_unsafe) {
2225 r->set_replayed_op();
2226 if (request->target)
2227 r->head.ino = request->target->ino;
2228 } else {
2229 encode_cap_releases(request, mds);
2230 if (drop_cap_releases) // we haven't sent the cap reconnect yet; drop cap releases
2231 request->cap_releases.clear();
2232 else
2233 r->releases.swap(request->cap_releases);
2234 }
2235 r->set_mdsmap_epoch(mdsmap->get_epoch());
2236 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2237 objecter->with_osdmap([r](const OSDMap& o) {
2238 r->set_osdmap_epoch(o.get_epoch());
2239 });
2240 }
2241
2242 if (request->mds == -1) {
2243 request->sent_stamp = ceph_clock_now();
2244 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2245 }
2246 request->mds = mds;
2247
2248 Inode *in = request->inode();
2249 if (in) {
2250 auto it = in->caps.find(mds);
2251 if (it != in->caps.end()) {
2252 request->sent_on_mseq = it->second.mseq;
2253 }
2254 }
2255
2256 session->requests.push_back(&request->item);
2257
2258 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2259 session->con->send_message2(std::move(r));
2260 }
2261
2262 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2263 {
2264 auto req = make_message<MClientRequest>(request->get_op());
2265 req->set_tid(request->tid);
2266 req->set_stamp(request->op_stamp);
2267 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2268
2269 // if the filepaths haven't been set, set them!
2270 if (request->path.empty()) {
2271 Inode *in = request->inode();
2272 Dentry *de = request->dentry();
2273 if (in)
2274 in->make_nosnap_relative_path(request->path);
2275 else if (de) {
2276 if (de->inode)
2277 de->inode->make_nosnap_relative_path(request->path);
2278 else if (de->dir) {
2279 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2280 request->path.push_dentry(de->name);
2281 }
2282 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2283 << " No path, inode, or appropriately-endowed dentry given!"
2284 << dendl;
2285 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2286 << " No path, inode, or dentry given!"
2287 << dendl;
2288 }
2289 req->set_filepath(request->get_filepath());
2290 req->set_filepath2(request->get_filepath2());
2291 req->set_data(request->data);
2292 req->set_retry_attempt(request->retry_attempt++);
2293 req->head.num_fwd = request->num_fwd;
2294 const gid_t *_gids;
2295 int gid_count = request->perms.get_gids(&_gids);
2296 req->set_gid_list(gid_count, _gids);
2297 return req;
2298 }
2299
2300
2301
2302 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2303 {
2304 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2305 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2306 if (!session) {
2307 return;
2308 }
2309 ceph_tid_t tid = fwd->get_tid();
2310
2311 if (mds_requests.count(tid) == 0) {
2312 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2313 return;
2314 }
2315
2316 MetaRequest *request = mds_requests[tid];
2317 ceph_assert(request);
2318
2319 // reset retry counter
2320 request->retry_attempt = 0;
2321
2322 // the request was not forwarded on our behalf, or the dest mds has no
2323 // session for us; resend it to the new target ourselves.
2324 ldout(cct, 10) << __func__ << " tid " << tid
2325 << " fwd " << fwd->get_num_fwd()
2326 << " to mds." << fwd->get_dest_mds()
2327 << ", resending to " << fwd->get_dest_mds()
2328 << dendl;
2329
2330 request->mds = -1;
2331 request->item.remove_myself();
2332 request->num_fwd = fwd->get_num_fwd();
2333 request->resend_mds = fwd->get_dest_mds();
2334 request->caller_cond->notify_all();
2335 }
2336
2337 bool Client::is_dir_operation(MetaRequest *req)
2338 {
2339 int op = req->get_op();
2340 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2341 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2342 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2343 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2344 return true;
2345 return false;
2346 }
2347
2348 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2349 {
2350 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2351 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2352 if (!session) {
2353 return;
2354 }
2355
2356 ceph_tid_t tid = reply->get_tid();
2357 bool is_safe = reply->is_safe();
2358
2359 if (mds_requests.count(tid) == 0) {
2360 lderr(cct) << __func__ << " no pending request on tid " << tid
2361 << " safe is:" << is_safe << dendl;
2362 return;
2363 }
2364 MetaRequest *request = mds_requests.at(tid);
2365
2366 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2367 << " tid " << tid << dendl;
2368
2369 if (request->got_unsafe && !is_safe) {
2370 //duplicate response
2371 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2372 << mds_num << " safe:" << is_safe << dendl;
2373 return;
2374 }
2375
2376 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2377 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2378 << " from mds." << request->mds << dendl;
2379 request->send_to_auth = true;
2380 request->resend_mds = choose_target_mds(request);
2381 Inode *in = request->inode();
2382 std::map<mds_rank_t, Cap>::const_iterator it;
2383 if (request->resend_mds >= 0 &&
2384 request->resend_mds == request->mds &&
2385 (in == NULL ||
2386 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2387 request->sent_on_mseq == it->second.mseq)) {
2388 ldout(cct, 20) << "have to return ESTALE" << dendl;
2389 } else {
2390 request->caller_cond->notify_all();
2391 return;
2392 }
2393 }
2394
2395 ceph_assert(!request->reply);
2396 request->reply = reply;
2397 insert_trace(request, session);
2398
2399 // Handle unsafe reply
2400 if (!is_safe) {
2401 request->got_unsafe = true;
2402 session->unsafe_requests.push_back(&request->unsafe_item);
2403 if (is_dir_operation(request)) {
2404 Inode *dir = request->inode();
2405 ceph_assert(dir);
2406 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2407 }
2408 if (request->target) {
2409 InodeRef &in = request->target;
2410 in->unsafe_ops.push_back(&request->unsafe_target_item);
2411 }
2412 }
2413
2414 // Only signal the caller once (on the first reply):
2415 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2416 if (!is_safe || !request->got_unsafe) {
2417 ceph::condition_variable cond;
2418 request->dispatch_cond = &cond;
2419
2420 // wake up waiter
2421 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2422 request->caller_cond->notify_all();
2423
2424 // wake for kick back
2425 std::unique_lock l{client_lock, std::adopt_lock};
2426 cond.wait(l, [tid, request, &cond, this] {
2427 if (request->dispatch_cond) {
2428 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2429 << tid << " " << &cond << dendl;
2430 }
2431 return !request->dispatch_cond;
2432 });
2433 l.release();
2434 }
2435
2436 if (is_safe) {
2437 // the filesystem change is committed to disk
2438 // we're done, clean up
2439 if (request->got_unsafe) {
2440 request->unsafe_item.remove_myself();
2441 request->unsafe_dir_item.remove_myself();
2442 request->unsafe_target_item.remove_myself();
2443 signal_cond_list(request->waitfor_safe);
2444 }
2445 request->item.remove_myself();
2446 unregister_request(request);
2447 }
2448 if (unmounting)
2449 mount_cond.notify_all();
2450 }
2451
2452 void Client::_handle_full_flag(int64_t pool)
2453 {
2454 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2455 << "on " << pool << dendl;
2456 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2457 // to do this rather than blocking, because otherwise when we fill up we
2458 // potentially lock caps forever on files with dirty pages, and we need
2459 // to be able to release those caps to the MDS so that it can delete files
2460 // and free up space.
2461 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2462
2463 // For all inodes with layouts in this pool and a pending flush write op
2464 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2465 // from ObjectCacher so that it doesn't re-issue the write in response to
2466 // the ENOSPC error.
2467 // Fortunately since we're cancelling everything in a given pool, we don't
2468 // need to know which ops belong to which ObjectSet, we can just blow all
2469 // the un-flushed cached data away and mark any dirty inodes' async_err
2470 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2471 // affecting this pool, and all the objectsets we're purging were also
2472 // in this pool.
2473 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2474 i != inode_map.end(); ++i)
2475 {
2476 Inode *inode = i->second;
2477 if (inode->oset.dirty_or_tx
2478 && (pool == -1 || inode->layout.pool_id == pool)) {
2479 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2480 << " has dirty objects, purging and setting ENOSPC" << dendl;
2481 objectcacher->purge_set(&inode->oset);
2482 inode->set_async_err(-ENOSPC);
2483 }
2484 }
2485
2486 if (cancelled_epoch != (epoch_t)-1) {
2487 set_cap_epoch_barrier(cancelled_epoch);
2488 }
2489 }
2490
2491 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2492 {
2493 std::set<entity_addr_t> new_blacklists;
2494 objecter->consume_blacklist_events(&new_blacklists);
2495
2496 const auto myaddrs = messenger->get_myaddrs();
2497 bool new_blacklist = false;
2498 bool prenautilus = objecter->with_osdmap(
2499 [&](const OSDMap& o) {
2500 return o.require_osd_release < ceph_release_t::nautilus;
2501 });
2502 if (!blacklisted) {
2503 for (auto a : myaddrs.v) {
2504 // blacklist entries are always TYPE_ANY for nautilus+
2505 a.set_type(entity_addr_t::TYPE_ANY);
2506 if (new_blacklists.count(a)) {
2507 new_blacklist = true;
2508 break;
2509 }
2510 if (prenautilus) {
2511 // ...except pre-nautilus, they were TYPE_LEGACY
2512 a.set_type(entity_addr_t::TYPE_LEGACY);
2513 if (new_blacklists.count(a)) {
2514 new_blacklist = true;
2515 break;
2516 }
2517 }
2518 }
2519 }
2520 if (new_blacklist) {
2521 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2522 return o.get_epoch();
2523 });
2524 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2525 blacklisted = true;
2526
2527 _abort_mds_sessions(-EBLACKLISTED);
2528
2529 // Since we know all our OSD ops will fail, cancel them all preemptively,
2530 // so that on an unhealthy cluster we can umount promptly even if e.g.
2531 // some PGs were inaccessible.
2532 objecter->op_cancel_writes(-EBLACKLISTED);
2533
2534 } else if (blacklisted) {
2535 // Handle case where we were blacklisted but no longer are
2536 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2537 return o.is_blacklisted(myaddrs);});
2538 }
2539
2540 // Keep subscribing to subsequent osdmaps until this client is
2541 // no longer blacklisted.
2542 if (blacklisted) {
2543 objecter->maybe_request_map();
2544 }
2545
2546 if (objecter->osdmap_full_flag()) {
2547 _handle_full_flag(-1);
2548 } else {
2549 // Accumulate local list of full pools so that I can drop
2550 // the objecter lock before re-entering objecter in
2551 // cancel_writes
2552 std::vector<int64_t> full_pools;
2553
2554 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2555 for (const auto& kv : o.get_pools()) {
2556 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2557 full_pools.push_back(kv.first);
2558 }
2559 }
2560 });
2561
2562 for (auto p : full_pools)
2563 _handle_full_flag(p);
2564
2565 // Subscribe to subsequent maps to watch for the full flag going
2566 // away. For the global full flag objecter does this for us, but
2567 // it pays no attention to the per-pool full flag so in this branch
2568 // we do it ourselves.
2569 if (!full_pools.empty()) {
2570 objecter->maybe_request_map();
2571 }
2572 }
2573 }
2574
2575
2576 // ------------------------
2577 // incoming messages
2578
2579
2580 bool Client::ms_dispatch2(const MessageRef &m)
2581 {
2582 std::lock_guard l(client_lock);
2583 if (!initialized) {
2584 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2585 return true;
2586 }
2587
2588 switch (m->get_type()) {
2589 // mounting and mds sessions
2590 case CEPH_MSG_MDS_MAP:
2591 handle_mds_map(ref_cast<MMDSMap>(m));
2592 break;
2593 case CEPH_MSG_FS_MAP:
2594 handle_fs_map(ref_cast<MFSMap>(m));
2595 break;
2596 case CEPH_MSG_FS_MAP_USER:
2597 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2598 break;
2599 case CEPH_MSG_CLIENT_SESSION:
2600 handle_client_session(ref_cast<MClientSession>(m));
2601 break;
2602
2603 case CEPH_MSG_OSD_MAP:
2604 handle_osd_map(ref_cast<MOSDMap>(m));
2605 break;
2606
2607 // requests
2608 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2609 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2610 break;
2611 case CEPH_MSG_CLIENT_REPLY:
2612 handle_client_reply(ref_cast<MClientReply>(m));
2613 break;
2614
2615 // reclaim reply
2616 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2617 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2618 break;
2619
2620 case CEPH_MSG_CLIENT_SNAP:
2621 handle_snap(ref_cast<MClientSnap>(m));
2622 break;
2623 case CEPH_MSG_CLIENT_CAPS:
2624 handle_caps(ref_cast<MClientCaps>(m));
2625 break;
2626 case CEPH_MSG_CLIENT_LEASE:
2627 handle_lease(ref_cast<MClientLease>(m));
2628 break;
2629 case MSG_COMMAND_REPLY:
2630 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2631 handle_command_reply(ref_cast<MCommandReply>(m));
2632 } else {
2633 return false;
2634 }
2635 break;
2636 case CEPH_MSG_CLIENT_QUOTA:
2637 handle_quota(ref_cast<MClientQuota>(m));
2638 break;
2639
2640 default:
2641 return false;
2642 }
2643
2644 // unmounting?
2645 if (unmounting) {
2646 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2647 << "+" << inode_map.size() << dendl;
2648 long unsigned size = lru.lru_get_size() + inode_map.size();
2649 trim_cache();
2650 if (size > lru.lru_get_size() + inode_map.size()) {
2651 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2652 mount_cond.notify_all();
2653 } else {
2654 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2655 << "+" << inode_map.size() << dendl;
2656 }
2657 }
2658
2659 return true;
2660 }
2661
2662 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2663 {
2664 fsmap.reset(new FSMap(m->get_fsmap()));
2665
2666 signal_cond_list(waiting_for_fsmap);
2667
2668 monclient->sub_got("fsmap", fsmap->get_epoch());
2669 }
2670
2671 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2672 {
2673 fsmap_user.reset(new FSMapUser);
2674 *fsmap_user = m->get_fsmap();
2675
2676 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2677 signal_cond_list(waiting_for_fsmap);
2678 }
2679
2680 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2681 {
2682 mds_gid_t old_inc, new_inc;
2683 if (m->get_epoch() <= mdsmap->get_epoch()) {
2684 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2685 << " is identical to or older than our "
2686 << mdsmap->get_epoch() << dendl;
2687 return;
2688 }
2689
2690 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2691
2692 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2693 oldmap.swap(mdsmap);
2694
2695 mdsmap->decode(m->get_encoded());
2696
2697 // Cancel any commands for missing or laggy GIDs
2698 std::list<ceph_tid_t> cancel_ops;
2699 auto &commands = command_table.get_commands();
2700 for (const auto &i : commands) {
2701 auto &op = i.second;
2702 const mds_gid_t op_mds_gid = op.mds_gid;
2703 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2704 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2705 cancel_ops.push_back(i.first);
2706 if (op.outs) {
2707 std::ostringstream ss;
2708 ss << "MDS " << op_mds_gid << " went away";
2709 *(op.outs) = ss.str();
2710 }
2711 op.con->mark_down();
2712 if (op.on_finish) {
2713 op.on_finish->complete(-ETIMEDOUT);
2714 }
2715 }
2716 }
2717
2718 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2719 i != cancel_ops.end(); ++i) {
2720 command_table.erase(*i);
2721 }
2722
2723 // reset session
2724 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2725 mds_rank_t mds = p->first;
2726 MetaSession *session = &p->second;
2727 ++p;
2728
2729 int oldstate = oldmap->get_state(mds);
2730 int newstate = mdsmap->get_state(mds);
2731 if (!mdsmap->is_up(mds)) {
2732 session->con->mark_down();
2733 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2734 old_inc = oldmap->get_incarnation(mds);
2735 new_inc = mdsmap->get_incarnation(mds);
2736 if (old_inc != new_inc) {
2737 ldout(cct, 1) << "mds incarnation changed from "
2738 << old_inc << " to " << new_inc << dendl;
2739 oldstate = MDSMap::STATE_NULL;
2740 }
2741 session->con->mark_down();
2742 session->addrs = mdsmap->get_addrs(mds);
2743 // When new MDS starts to take over, notify kernel to trim unused entries
2744 // in its dcache/icache. Hopefully, the kernel will release some unused
2745 // inodes before the new MDS enters reconnect state.
2746 trim_cache_for_reconnect(session);
2747 } else if (oldstate == newstate)
2748 continue; // no change
2749
2750 session->mds_state = newstate;
2751 if (newstate == MDSMap::STATE_RECONNECT) {
2752 session->con = messenger->connect_to_mds(session->addrs);
2753 send_reconnect(session);
2754 } else if (newstate > MDSMap::STATE_RECONNECT) {
2755 if (oldstate < MDSMap::STATE_RECONNECT) {
2756 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2757 _closed_mds_session(session);
2758 continue;
2759 }
2760 if (newstate >= MDSMap::STATE_ACTIVE) {
2761 if (oldstate < MDSMap::STATE_ACTIVE) {
2762 // kick new requests
2763 kick_requests(session);
2764 kick_flushing_caps(session);
2765 signal_context_list(session->waiting_for_open);
2766 wake_up_session_caps(session, true);
2767 }
2768 connect_mds_targets(mds);
2769 }
2770 } else if (newstate == MDSMap::STATE_NULL &&
2771 mds >= mdsmap->get_max_mds()) {
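// the rank is gone and lies beyond the new max_mds, i.e. the
// cluster shrank; drop its session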
2772 _closed_mds_session(session);
2773 }
2774 }
2775
2776 // kick any waiting threads
2777 signal_cond_list(waiting_for_mdsmap);
2778
2779 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2780 }
2781
2782 void Client::send_reconnect(MetaSession *session)
2783 {
2784 mds_rank_t mds = session->mds_num;
2785 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2786
2787 // trim unused caps to reduce MDS's cache rejoin time
2788 trim_cache_for_reconnect(session);
2789
2790 session->readonly = false;
2791
2792 session->release.reset();
2793
2794 // reset my cap seq number
2795 session->seq = 0;
2796 //connect to the mds' offload targets
2797 connect_mds_targets(mds);
2798 //make sure unsafe requests get saved
2799 resend_unsafe_requests(session);
2800
2801 early_kick_flushing_caps(session);
2802
2803 auto m = make_message<MClientReconnect>();
2804 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2805
2806 // i have an open session.
2807 ceph::unordered_set<inodeno_t> did_snaprealm;
2808 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2809 p != inode_map.end();
2810 ++p) {
2811 Inode *in = p->second;
2812 auto it = in->caps.find(mds);
2813 if (it != in->caps.end()) {
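// split the reconnect across several messages once this one nears
// ~1 GiB (INT_MAX >> 1); only allowed when the MDS supports
// CEPHFS_FEATURE_MULTI_RECONNECT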
2814 if (allow_multi &&
2815 m->get_approx_size() >=
2816 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2817 m->mark_more();
2818 session->con->send_message2(std::move(m));
2819
2820 m = make_message<MClientReconnect>();
2821 }
2822
2823 Cap &cap = it->second;
2824 ldout(cct, 10) << " caps on " << p->first
2825 << " " << ccap_string(cap.issued)
2826 << " wants " << ccap_string(in->caps_wanted())
2827 << dendl;
2828 filepath path;
2829 in->make_long_path(path);
2830 ldout(cct, 10) << " path " << path << dendl;
2831
2832 bufferlist flockbl;
2833 _encode_filelocks(in, flockbl);
2834
2835 cap.seq = 0; // reset seq.
2836 cap.issue_seq = 0; // reset seq.
2837 cap.mseq = 0; // reset seq.
2838 // cap gen should catch up with session cap_gen
2839 if (cap.gen < session->cap_gen) {
2840 cap.gen = session->cap_gen;
2841 cap.issued = cap.implemented = CEPH_CAP_PIN;
2842 } else {
2843 cap.issued = cap.implemented;
2844 }
2845 snapid_t snap_follows = 0;
2846 if (!in->cap_snaps.empty())
2847 snap_follows = in->cap_snaps.begin()->first;
2848
2849 m->add_cap(p->first.ino,
2850 cap.cap_id,
2851 path.get_ino(), path.get_path(), // ino
2852 in->caps_wanted(), // wanted
2853 cap.issued, // issued
2854 in->snaprealm->ino,
2855 snap_follows,
2856 flockbl);
2857
2858 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2859 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2860 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2861 did_snaprealm.insert(in->snaprealm->ino);
2862 }
2863 }
2864 }
2865
2866 if (!allow_multi)
2867 m->set_encoding_version(0); // use connection features to choose encoding
2868 session->con->send_message2(std::move(m));
2869
2870 mount_cond.notify_all();
2871
2872 if (session->reclaim_state == MetaSession::RECLAIMING)
2873 signal_cond_list(waiting_for_reclaim);
2874 }
2875
2876
2877 void Client::kick_requests(MetaSession *session)
2878 {
2879 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2880 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2881 p != mds_requests.end();
2882 ++p) {
2883 MetaRequest *req = p->second;
2884 if (req->got_unsafe)
2885 continue;
2886 if (req->aborted()) {
2887 if (req->caller_cond) {
2888 req->kick = true;
2889 req->caller_cond->notify_all();
2890 }
2891 continue;
2892 }
2893 if (req->retry_attempt > 0)
2894 continue; // new requests only
2895 if (req->mds == session->mds_num) {
2896 send_request(p->second, session);
2897 }
2898 }
2899 }
2900
2901 void Client::resend_unsafe_requests(MetaSession *session)
2902 {
2903 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2904 !iter.end();
2905 ++iter)
2906 send_request(*iter, session);
2907
2908 // also re-send old requests when the MDS enters the reconnect stage, so
2909 // that it can process completed requests in the clientreplay stage.
2910 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2911 p != mds_requests.end();
2912 ++p) {
2913 MetaRequest *req = p->second;
2914 if (req->got_unsafe)
2915 continue;
2916 if (req->aborted())
2917 continue;
2918 if (req->retry_attempt == 0)
2919 continue; // old requests only
2920 if (req->mds == session->mds_num)
2921 send_request(req, session, true);
2922 }
2923 }
2924
2925 void Client::wait_unsafe_requests()
2926 {
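// Waiting on the newest unsafe request of each session suffices: the
// MDS commits requests in order, so once the last one is safe all of
// its predecessors are too.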
2927 list<MetaRequest*> last_unsafe_reqs;
2928 for (const auto &p : mds_sessions) {
2929 const MetaSession &s = p.second;
2930 if (!s.unsafe_requests.empty()) {
2931 MetaRequest *req = s.unsafe_requests.back();
2932 req->get();
2933 last_unsafe_reqs.push_back(req);
2934 }
2935 }
2936
2937 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2938 p != last_unsafe_reqs.end();
2939 ++p) {
2940 MetaRequest *req = *p;
2941 if (req->unsafe_item.is_on_list())
2942 wait_on_list(req->waitfor_safe);
2943 put_request(req);
2944 }
2945 }
2946
2947 void Client::kick_requests_closed(MetaSession *session)
2948 {
2949 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2950 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2951 p != mds_requests.end(); ) {
2952 MetaRequest *req = p->second;
2953 ++p;
2954 if (req->mds == session->mds_num) {
2955 if (req->caller_cond) {
2956 req->kick = true;
2957 req->caller_cond->notify_all();
2958 }
2959 req->item.remove_myself();
2960 if (req->got_unsafe) {
2961 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2962 req->unsafe_item.remove_myself();
2963 if (is_dir_operation(req)) {
2964 Inode *dir = req->inode();
2965 ceph_assert(dir);
2966 dir->set_async_err(-EIO);
2967 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2968 << dir->ino << " " << req->get_tid() << dendl;
2969 req->unsafe_dir_item.remove_myself();
2970 }
2971 if (req->target) {
2972 InodeRef &in = req->target;
2973 in->set_async_err(-EIO);
2974 lderr(cct) << "kick_requests_closed drop req of inode : "
2975 << in->ino << " " << req->get_tid() << dendl;
2976 req->unsafe_target_item.remove_myself();
2977 }
2978 signal_cond_list(req->waitfor_safe);
2979 unregister_request(req);
2980 }
2981 }
2982 }
2983 ceph_assert(session->requests.empty());
2984 ceph_assert(session->unsafe_requests.empty());
2985 }
2986
2987
2988
2989
2990 /************
2991 * leases
2992 */
2993
2994 void Client::got_mds_push(MetaSession *s)
2995 {
2996 s->seq++;
2997 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2998 if (s->state == MetaSession::STATE_CLOSING) {
2999 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3000 }
3001 }
3002
3003 void Client::handle_lease(const MConstRef<MClientLease>& m)
3004 {
3005 ldout(cct, 10) << __func__ << " " << *m << dendl;
3006
3007 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3008
3009 mds_rank_t mds = mds_rank_t(m->get_source().num());
3010 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3011 if (!session) {
3012 return;
3013 }
3014
3015 got_mds_push(session);
3016
3017 ceph_seq_t seq = m->get_seq();
3018
3019 Inode *in;
3020 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3021 if (inode_map.count(vino) == 0) {
3022 ldout(cct, 10) << " don't have vino " << vino << dendl;
3023 goto revoke;
3024 }
3025 in = inode_map[vino];
3026
3027 if (m->get_mask() & CEPH_LEASE_VALID) {
3028 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3029 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3030 goto revoke;
3031 }
3032 Dentry *dn = in->dir->dentries[m->dname];
3033 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3034 dn->lease_mds = -1;
3035 }
3036
3037 revoke:
3038 {
3039 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3040 m->get_mask(), m->get_ino(),
3041 m->get_first(), m->get_last(), m->dname);
3042 m->get_connection()->send_message2(std::move(reply));
3043 }
3044 }
3045
3046 void Client::put_inode(Inode *in, int n)
3047 {
3048 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3049 int left = in->_put(n);
3050 if (left == 0) {
3051 // release any caps
3052 remove_all_caps(in);
3053
3054 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3055 bool unclean = objectcacher->release_set(&in->oset);
3056 ceph_assert(!unclean);
3057 inode_map.erase(in->vino());
3058 if (use_faked_inos())
3059 _release_faked_ino(in);
3060
3061 if (in == root) {
3062 root = 0;
3063 root_ancestor = 0;
3064 while (!root_parents.empty())
3065 root_parents.erase(root_parents.begin());
3066 }
3067
3068 delete in;
3069 }
3070 }
3071
3072 void Client::close_dir(Dir *dir)
3073 {
3074 Inode *in = dir->parent_inode;
3075 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3076 ceph_assert(dir->is_empty());
3077 ceph_assert(in->dir == dir);
3078 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3079 if (!in->dentries.empty())
3080 in->get_first_parent()->put(); // unpin dentry
3081
3082 delete in->dir;
3083 in->dir = 0;
3084 put_inode(in); // unpin inode
3085 }
3086
3087 /**
3088 * Don't call this with in==NULL, use get_or_create for that
3089 * leave dn set to default NULL unless you're trying to add
3090 * a new inode to a pre-created Dentry
3091 */
3092 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3093 {
3094 if (!dn) {
3095 // create a new Dentry
3096 dn = new Dentry(dir, name);
3097
3098 lru.lru_insert_mid(dn); // mid or top?
3099
3100 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3101 << " dn " << dn << " (new dn)" << dendl;
3102 } else {
3103 ceph_assert(!dn->inode);
3104 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3105 << " dn " << dn << " (old dn)" << dendl;
3106 }
3107
3108 if (in) { // link to inode
3109 InodeRef tmp_ref;
3110 // only one parent for directories!
3111 if (in->is_dir() && !in->dentries.empty()) {
3112 tmp_ref = in; // prevent unlink below from freeing the inode.
3113 Dentry *olddn = in->get_first_parent();
3114 ceph_assert(olddn->dir != dir || olddn->name != name);
3115 Inode *old_diri = olddn->dir->parent_inode;
3116 old_diri->dir_release_count++;
3117 clear_dir_complete_and_ordered(old_diri, true);
3118 unlink(olddn, true, true); // keep dir, dentry
3119 }
3120
3121 dn->link(in);
3122 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3123 }
3124
3125 return dn;
3126 }
3127
3128 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3129 {
3130 InodeRef in(dn->inode);
3131 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3132 << " inode " << dn->inode << dendl;
3133
3134 // unlink from inode
3135 if (dn->inode) {
3136 dn->unlink();
3137 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3138 }
3139
3140 if (keepdentry) {
3141 dn->lease_mds = -1;
3142 } else {
3143 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3144
3145 // unlink from dir
3146 Dir *dir = dn->dir;
3147 dn->detach();
3148
3149 // delete the dentry
3150 lru.lru_remove(dn);
3151 dn->put();
3152
3153 if (dir->is_empty() && !keepdir)
3154 close_dir(dir);
3155 }
3156 }
3157
3158 /**
3159 * For asynchronous flushes, check for errors from the IO and
3160 * update the inode if necessary
3161 */
3162 class C_Client_FlushComplete : public Context {
3163 private:
3164 Client *client;
3165 InodeRef inode;
3166 public:
3167 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3168 void finish(int r) override {
3169 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3170 if (r != 0) {
3171 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3172 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3173 << " 0x" << std::hex << inode->ino << std::dec
3174 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3175 inode->set_async_err(r);
3176 }
3177 }
3178 };
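// Used as the completion for asynchronous flushes, e.g.
// _flush(in, new C_Client_FlushComplete(this, in)) in get_caps() below.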
3179
3180
3181 /****
3182 * caps
3183 */
3184
3185 void Client::get_cap_ref(Inode *in, int cap)
3186 {
3187 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3188 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3189 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3190 in->get();
3191 }
3192 if ((cap & CEPH_CAP_FILE_CACHE) &&
3193 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3194 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3195 in->get();
3196 }
3197 in->get_cap_ref(cap);
3198 }
3199
3200 void Client::put_cap_ref(Inode *in, int cap)
3201 {
3202 int last = in->put_cap_ref(cap);
3203 if (last) {
3204 int put_nref = 0;
3205 int drop = last & ~in->caps_issued();
3206 if (in->snapid == CEPH_NOSNAP) {
3207 if ((last & CEPH_CAP_FILE_WR) &&
3208 !in->cap_snaps.empty() &&
3209 in->cap_snaps.rbegin()->second.writing) {
3210 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3211 in->cap_snaps.rbegin()->second.writing = 0;
3212 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3213 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3214 }
3215 if (last & CEPH_CAP_FILE_BUFFER) {
3216 for (auto &p : in->cap_snaps)
3217 p.second.dirty_data = 0;
3218 signal_cond_list(in->waitfor_commit);
3219 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3220 ++put_nref;
3221 }
3222 }
3223 if (last & CEPH_CAP_FILE_CACHE) {
3224 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3225 ++put_nref;
3226 }
3227 if (drop)
3228 check_caps(in, 0);
3229 if (put_nref)
3230 put_inode(in, put_nref);
3231 }
3232 }
3233
3234 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3235 {
3236 int r = check_pool_perm(in, need);
3237 if (r < 0)
3238 return r;
3239
3240 while (1) {
3241 int file_wanted = in->caps_file_wanted();
3242 if ((file_wanted & need) != need) {
3243 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3244 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3245 << dendl;
3246 return -EBADF;
3247 }
3248
3249 int implemented;
3250 int have = in->caps_issued(&implemented);
3251
3252 bool waitfor_caps = false;
3253 bool waitfor_commit = false;
3254
3255 if (have & need & CEPH_CAP_FILE_WR) {
3256 if (endoff > 0) {
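// grow wanted_max_size when this write would pass the current limit
// or more than double the file, then ask the MDS for it below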
3257 if ((endoff >= (loff_t)in->max_size ||
3258 endoff > (loff_t)(in->size << 1)) &&
3259 endoff > (loff_t)in->wanted_max_size) {
3260 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3261 in->wanted_max_size = endoff;
3262 }
3263 if (in->wanted_max_size > in->max_size &&
3264 in->wanted_max_size > in->requested_max_size)
3265 check_caps(in, 0);
3266 }
3267
3268 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3269 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3270 waitfor_caps = true;
3271 }
3272 if (!in->cap_snaps.empty()) {
3273 if (in->cap_snaps.rbegin()->second.writing) {
3274 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3275 waitfor_caps = true;
3276 }
3277 for (auto &p : in->cap_snaps) {
3278 if (p.second.dirty_data) {
3279 waitfor_commit = true;
3280 break;
3281 }
3282 }
3283 if (waitfor_commit) {
3284 _flush(in, new C_Client_FlushComplete(this, in));
3285 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3286 }
3287 }
3288 }
3289
3290 if (!waitfor_caps && !waitfor_commit) {
3291 if ((have & need) == need) {
3292 int revoking = implemented & ~have;
3293 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3294 << " need " << ccap_string(need) << " want " << ccap_string(want)
3295 << " revoking " << ccap_string(revoking)
3296 << dendl;
3297 if ((revoking & want) == 0) {
3298 *phave = need | (have & want);
3299 in->get_cap_ref(need);
3300 return 0;
3301 }
3302 }
3303 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3304 waitfor_caps = true;
3305 }
3306
3307 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3308 in->auth_cap->session->readonly)
3309 return -EROFS;
3310
3311 if (in->flags & I_CAP_DROPPED) {
3312 int mds_wanted = in->caps_mds_wanted();
3313 if ((mds_wanted & need) != need) {
3314 int ret = _renew_caps(in);
3315 if (ret < 0)
3316 return ret;
3317 continue;
3318 }
3319 if (!(file_wanted & ~mds_wanted))
3320 in->flags &= ~I_CAP_DROPPED;
3321 }
3322
3323 if (waitfor_caps)
3324 wait_on_list(in->waitfor_caps);
3325 else if (waitfor_commit)
3326 wait_on_list(in->waitfor_commit);
3327 }
3328 }
3329
3330 int Client::get_caps_used(Inode *in)
3331 {
3332 unsigned used = in->caps_used();
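// data still held by the ObjectCacher counts as a FILE_CACHE user even
// when no open file handle references it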
3333 if (!(used & CEPH_CAP_FILE_CACHE) &&
3334 !objectcacher->set_is_empty(&in->oset))
3335 used |= CEPH_CAP_FILE_CACHE;
3336 return used;
3337 }
3338
3339 void Client::cap_delay_requeue(Inode *in)
3340 {
3341 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3342 in->hold_caps_until = ceph_clock_now();
3343 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3344 delayed_list.push_back(&in->delay_cap_item);
3345 }
3346
3347 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3348 int flags, int used, int want, int retain,
3349 int flush, ceph_tid_t flush_tid)
3350 {
3351 int held = cap->issued | cap->implemented;
3352 int revoking = cap->implemented & ~cap->issued;
3353 retain &= ~revoking;
3354 int dropping = cap->issued & ~retain;
3355 int op = CEPH_CAP_OP_UPDATE;
3356
3357 ldout(cct, 10) << __func__ << " " << *in
3358 << " mds." << session->mds_num << " seq " << cap->seq
3359 << " used " << ccap_string(used)
3360 << " want " << ccap_string(want)
3361 << " flush " << ccap_string(flush)
3362 << " retain " << ccap_string(retain)
3363 << " held "<< ccap_string(held)
3364 << " revoking " << ccap_string(revoking)
3365 << " dropping " << ccap_string(dropping)
3366 << dendl;
3367
3368 if (cct->_conf->client_inject_release_failure && revoking) {
3369 const int would_have_issued = cap->issued & retain;
3370 const int would_have_implemented = cap->implemented & (cap->issued | used);
3371 // Simulated bug:
3372 // - tell the server we think issued is whatever they issued plus whatever we implemented
3373 // - leave what we have implemented in place
3374 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3375 cap->issued = cap->issued | cap->implemented;
3376
3377 // Make an exception for revoking xattr caps: we are injecting
3378 // failure to release other caps, but allow xattr because client
3379 // will block on xattr ops if it can't release these to MDS (#9800)
3380 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3381 cap->issued ^= xattr_mask & revoking;
3382 cap->implemented ^= xattr_mask & revoking;
3383
3384 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3385 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3386 } else {
3387 // Normal behaviour
3388 cap->issued &= retain;
3389 cap->implemented &= cap->issued | used;
3390 }
3391
3392 snapid_t follows = 0;
3393
3394 if (flush)
3395 follows = in->snaprealm->get_snap_context().seq;
3396
3397 auto m = make_message<MClientCaps>(op,
3398 in->ino,
3399 0,
3400 cap->cap_id, cap->seq,
3401 cap->implemented,
3402 want,
3403 flush,
3404 cap->mseq,
3405 cap_epoch_barrier);
3406 m->caller_uid = in->cap_dirtier_uid;
3407 m->caller_gid = in->cap_dirtier_gid;
3408
3409 m->head.issue_seq = cap->issue_seq;
3410 m->set_tid(flush_tid);
3411
3412 m->head.uid = in->uid;
3413 m->head.gid = in->gid;
3414 m->head.mode = in->mode;
3415
3416 m->head.nlink = in->nlink;
3417
3418 if (flush & CEPH_CAP_XATTR_EXCL) {
3419 encode(in->xattrs, m->xattrbl);
3420 m->head.xattr_version = in->xattr_version;
3421 }
3422
3423 m->size = in->size;
3424 m->max_size = in->max_size;
3425 m->truncate_seq = in->truncate_seq;
3426 m->truncate_size = in->truncate_size;
3427 m->mtime = in->mtime;
3428 m->atime = in->atime;
3429 m->ctime = in->ctime;
3430 m->btime = in->btime;
3431 m->time_warp_seq = in->time_warp_seq;
3432 m->change_attr = in->change_attr;
3433
3434 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3435 !in->cap_snaps.empty() &&
3436 in->cap_snaps.rbegin()->second.flush_tid == 0)
3437 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3438 m->flags = flags;
3439
3440 if (flush & CEPH_CAP_FILE_WR) {
3441 m->inline_version = in->inline_version;
3442 m->inline_data = in->inline_data;
3443 }
3444
3445 in->reported_size = in->size;
3446 m->set_snap_follows(follows);
3447 cap->wanted = want;
3448 if (cap == in->auth_cap) {
3449 if (want & CEPH_CAP_ANY_FILE_WR) {
3450 m->set_max_size(in->wanted_max_size);
3451 in->requested_max_size = in->wanted_max_size;
3452 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3453 } else {
3454 in->requested_max_size = 0;
3455 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3456 }
3457 }
3458
3459 if (!session->flushing_caps_tids.empty())
3460 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3461
3462 session->con->send_message2(std::move(m));
3463 }
3464
3465 static bool is_max_size_approaching(Inode *in)
3466 {
3467 /* mds will adjust max size according to the reported size */
3468 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3469 return false;
3470 if (in->size >= in->max_size)
3471 return true;
3472 /* half of previous max_size increment has been used */
3473 if (in->max_size > in->reported_size &&
3474 (in->size << 1) >= in->max_size + in->reported_size)
3475 return true;
3476 return false;
3477 }
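// Worked example: with max_size = 8MB and reported_size = 4MB the last
// increment was 4MB, so (size << 1) >= 12MB -- i.e. size >= 6MB --
// flags the limit as approaching once half of that increment is used.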
3478
3479 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3480 {
3481 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3482 return used;
3483 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3484 return used;
3485
3486 if (issued & CEPH_CAP_FILE_LAZYIO) {
3487 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3488 used &= ~CEPH_CAP_FILE_CACHE;
3489 used |= CEPH_CAP_FILE_LAZYIO;
3490 }
3491 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3492 used &= ~CEPH_CAP_FILE_BUFFER;
3493 used |= CEPH_CAP_FILE_LAZYIO;
3494 }
3495 } else {
3496 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3497 used &= ~CEPH_CAP_FILE_CACHE;
3498 used |= CEPH_CAP_FILE_LAZYIO;
3499 }
3500 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3501 used &= ~CEPH_CAP_FILE_BUFFER;
3502 used |= CEPH_CAP_FILE_LAZYIO;
3503 }
3504 }
3505 return used;
3506 }
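// e.g. if Fc is in use but only LAZYIO is issued, the used mask reports
// LAZYIO in place of Fc, letting a pending Fc revocation complete in
// check_caps() below.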
3507
3508 /**
3509 * check_caps
3510 *
3511 * Examine currently used and wanted versus held caps. Release, flush or ack
3512 * revoked caps to the MDS as appropriate.
3513 *
3514 * @param in the inode to check
3515 * @param flags flags to apply to cap check
3516 */
3517 void Client::check_caps(Inode *in, unsigned flags)
3518 {
3519 unsigned wanted = in->caps_wanted();
3520 unsigned used = get_caps_used(in);
3521 unsigned cap_used;
3522
3523 int implemented;
3524 int issued = in->caps_issued(&implemented);
3525 int revoking = implemented & ~issued;
3526
3527 int orig_used = used;
3528 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3529
3530 int retain = wanted | used | CEPH_CAP_PIN;
3531 if (!unmounting && in->nlink > 0) {
3532 if (wanted) {
3533 retain |= CEPH_CAP_ANY;
3534 } else if (in->is_dir() &&
3535 (issued & CEPH_CAP_FILE_SHARED) &&
3536 (in->flags & I_COMPLETE)) {
3537 // we do this here because we don't want to drop to Fs (and then
3538 // drop the Fs if we do a create!) if that alone makes us send lookups
3539 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3540 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3541 retain |= wanted;
3542 } else {
3543 retain |= CEPH_CAP_ANY_SHARED;
3544 // keep RD only if we didn't have the file open RW,
3545 // because then the mds would revoke it anyway to
3546 // journal max_size=0.
3547 if (in->max_size == 0)
3548 retain |= CEPH_CAP_ANY_RD;
3549 }
3550 }
3551
3552 ldout(cct, 10) << __func__ << " on " << *in
3553 << " wanted " << ccap_string(wanted)
3554 << " used " << ccap_string(used)
3555 << " issued " << ccap_string(issued)
3556 << " revoking " << ccap_string(revoking)
3557 << " flags=" << flags
3558 << dendl;
3559
3560 if (in->snapid != CEPH_NOSNAP)
3561 return; //snap caps last forever, can't write
3562
3563 if (in->caps.empty())
3564 return; // guard if at end of func
3565
3566 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3567 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3568 if (_release(in))
3569 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3570 }
3571
3572
3573 for (auto &p : in->caps) {
3574 mds_rank_t mds = p.first;
3575 Cap &cap = p.second;
3576
3577 MetaSession *session = &mds_sessions.at(mds);
3578
3579 cap_used = used;
3580 if (in->auth_cap && &cap != in->auth_cap)
3581 cap_used &= ~in->auth_cap->issued;
3582
3583 revoking = cap.implemented & ~cap.issued;
3584
3585 ldout(cct, 10) << " cap mds." << mds
3586 << " issued " << ccap_string(cap.issued)
3587 << " implemented " << ccap_string(cap.implemented)
3588 << " revoking " << ccap_string(revoking) << dendl;
3589
3590 if (in->wanted_max_size > in->max_size &&
3591 in->wanted_max_size > in->requested_max_size &&
3592 &cap == in->auth_cap)
3593 goto ack;
3594
3595 /* approaching file_max? */
3596 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3597 &cap == in->auth_cap &&
3598 is_max_size_approaching(in)) {
3599 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3600 << ", reported " << in->reported_size << dendl;
3601 goto ack;
3602 }
3603
3604 /* completed revocation? */
3605 if (revoking && (revoking & cap_used) == 0) {
3606 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3607 goto ack;
3608 }
3609
3610 /* want more caps from mds? */
3611 if (wanted & ~(cap.wanted | cap.issued))
3612 goto ack;
3613
3614 if (!revoking && unmounting && (cap_used == 0))
3615 goto ack;
3616
3617 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3618 !in->dirty_caps) // and we have no dirty caps
3619 continue;
3620
3621 if (!(flags & CHECK_CAPS_NODELAY)) {
3622 ldout(cct, 10) << "delaying cap release" << dendl;
3623 cap_delay_requeue(in);
3624 continue;
3625 }
3626
3627 ack:
3628 if (&cap == in->auth_cap) {
3629 if (in->flags & I_KICK_FLUSH) {
3630 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3631 << " to mds." << mds << dendl;
3632 kick_flushing_caps(in, session);
3633 }
3634 if (!in->cap_snaps.empty() &&
3635 in->cap_snaps.rbegin()->second.flush_tid == 0)
3636 flush_snaps(in);
3637 }
3638
3639 int flushing;
3640 int msg_flags = 0;
3641 ceph_tid_t flush_tid;
3642 if (in->auth_cap == &cap && in->dirty_caps) {
3643 flushing = mark_caps_flushing(in, &flush_tid);
3644 if (flags & CHECK_CAPS_SYNCHRONOUS)
3645 msg_flags |= MClientCaps::FLAG_SYNC;
3646 } else {
3647 flushing = 0;
3648 flush_tid = 0;
3649 }
3650
3651 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3652 flushing, flush_tid);
3653 }
3654 }
3655
3656
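// A CapSnap records an inode's dirty state (metadata and, if buffered, data)
// at the moment a snapshot cuts across it, so that state can be flushed to
// the MDS independently of subsequent changes to the live inode.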
3657 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3658 {
3659 int used = get_caps_used(in);
3660 int dirty = in->caps_dirty();
3661 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3662
3663 if (in->cap_snaps.size() &&
3664 in->cap_snaps.rbegin()->second.writing) {
3665 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3666 return;
3667 } else if (in->caps_dirty() ||
3668 (used & CEPH_CAP_FILE_WR) ||
3669 (dirty & CEPH_CAP_ANY_WR)) {
3670 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3671 ceph_assert(capsnapem.second); /* element inserted */
3672 CapSnap &capsnap = capsnapem.first->second;
3673 capsnap.context = old_snapc;
3674 capsnap.issued = in->caps_issued();
3675 capsnap.dirty = in->caps_dirty();
3676
3677 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3678
3679 capsnap.uid = in->uid;
3680 capsnap.gid = in->gid;
3681 capsnap.mode = in->mode;
3682 capsnap.btime = in->btime;
3683 capsnap.xattrs = in->xattrs;
3684 capsnap.xattr_version = in->xattr_version;
3685 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3686 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3687
3688 if (used & CEPH_CAP_FILE_WR) {
3689 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3690 capsnap.writing = 1;
3691 } else {
3692 finish_cap_snap(in, capsnap, used);
3693 }
3694 } else {
3695 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3696 }
3697 }
3698
3699 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3700 {
3701 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3702 capsnap.size = in->size;
3703 capsnap.mtime = in->mtime;
3704 capsnap.atime = in->atime;
3705 capsnap.ctime = in->ctime;
3706 capsnap.time_warp_seq = in->time_warp_seq;
3707 capsnap.change_attr = in->change_attr;
3708 capsnap.dirty |= in->caps_dirty();
3709
3710 /* Only reset it if it wasn't set before */
3711 if (capsnap.cap_dirtier_uid == -1) {
3712 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3713 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3714 }
3715
3716 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3717 capsnap.inline_data = in->inline_data;
3718 capsnap.inline_version = in->inline_version;
3719 }
3720
3721 if (used & CEPH_CAP_FILE_BUFFER) {
3722 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3723 << " WRBUFFER, delaying" << dendl;
3724 } else {
3725 capsnap.dirty_data = 0;
3726 flush_snaps(in);
3727 }
3728 }
3729
3730 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3731 {
3732 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3733 in->cap_snaps.at(seq).dirty_data = 0;
3734 flush_snaps(in);
3735 }
3736
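// Build and send one CEPH_CAP_OP_FLUSHSNAP message: it carries the CapSnap's
// frozen metadata (size, times, xattrs, ...) tagged with the capsnap's flush
// tid so the eventual FLUSHSNAP_ACK can retire it.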
3737 void Client::send_flush_snap(Inode *in, MetaSession *session,
3738 snapid_t follows, CapSnap& capsnap)
3739 {
3740 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3741 in->ino, in->snaprealm->ino, 0,
3742 in->auth_cap->mseq, cap_epoch_barrier);
3743 m->caller_uid = capsnap.cap_dirtier_uid;
3744 m->caller_gid = capsnap.cap_dirtier_gid;
3745
3746 m->set_client_tid(capsnap.flush_tid);
3747 m->head.snap_follows = follows;
3748
3749 m->head.caps = capsnap.issued;
3750 m->head.dirty = capsnap.dirty;
3751
3752 m->head.uid = capsnap.uid;
3753 m->head.gid = capsnap.gid;
3754 m->head.mode = capsnap.mode;
3755 m->btime = capsnap.btime;
3756
3757 m->size = capsnap.size;
3758
3759 m->head.xattr_version = capsnap.xattr_version;
3760 encode(capsnap.xattrs, m->xattrbl);
3761
3762 m->ctime = capsnap.ctime;
3764 m->mtime = capsnap.mtime;
3765 m->atime = capsnap.atime;
3766 m->time_warp_seq = capsnap.time_warp_seq;
3767 m->change_attr = capsnap.change_attr;
3768
3769 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3770 m->inline_version = in->inline_version;
3771 m->inline_data = in->inline_data;
3772 }
3773
3774 ceph_assert(!session->flushing_caps_tids.empty());
3775 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3776
3777 session->con->send_message2(std::move(m));
3778 }
3779
3780 void Client::flush_snaps(Inode *in)
3781 {
3782 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3783 ceph_assert(in->cap_snaps.size());
3784
3785 // pick auth mds
3786 ceph_assert(in->auth_cap);
3787 MetaSession *session = in->auth_cap->session;
3788
3789 for (auto &p : in->cap_snaps) {
3790 CapSnap &capsnap = p.second;
3791 // only do new flush
3792 if (capsnap.flush_tid > 0)
3793 continue;
3794
3795 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3796 << " follows " << p.first
3797 << " size " << capsnap.size
3798 << " mtime " << capsnap.mtime
3799 << " dirty_data=" << capsnap.dirty_data
3800 << " writing=" << capsnap.writing
3801 << " on " << *in << dendl;
3802 if (capsnap.dirty_data || capsnap.writing)
3803 break;
3804
3805 capsnap.flush_tid = ++last_flush_tid;
3806 session->flushing_caps_tids.insert(capsnap.flush_tid);
3807 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3808 if (!in->flushing_cap_item.is_on_list())
3809 session->flushing_caps.push_back(&in->flushing_cap_item);
3810
3811 send_flush_snap(in, session, p.first, capsnap);
3812 }
3813 }
3814
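// Condition-variable helpers: callers already hold client_lock, so the
// waiters adopt it into a unique_lock for the wait and then release() it
// (without unlocking) to hand ownership back to the caller.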
3815 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3816 {
3817 ceph::condition_variable cond;
3818 ls.push_back(&cond);
3819 std::unique_lock l{client_lock, std::adopt_lock};
3820 cond.wait(l);
3821 l.release();
3822 ls.remove(&cond);
3823 }
3824
3825 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3826 {
3827 for (auto cond : ls) {
3828 cond->notify_all();
3829 }
3830 }
3831
3832 void Client::wait_on_context_list(list<Context*>& ls)
3833 {
3834 ceph::condition_variable cond;
3835 bool done = false;
3836 int r;
3837 ls.push_back(new C_Cond(cond, &done, &r));
3838 std::unique_lock l{client_lock, std::adopt_lock};
3839 cond.wait(l, [&done] { return done;});
3840 l.release();
3841 }
3842
3843 void Client::signal_context_list(list<Context*>& ls)
3844 {
3845 while (!ls.empty()) {
3846 ls.front()->complete(0);
3847 ls.pop_front();
3848 }
3849 }
3850
3851 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
3852 {
3853 for (const auto &cap : s->caps) {
3854 auto &in = cap->inode;
3855 if (reconnect) {
3856 in.requested_max_size = 0;
3857 in.wanted_max_size = 0;
3858 } else {
3859 if (cap->gen < s->cap_gen) {
3860 // mds did not re-issue stale cap.
3861 cap->issued = cap->implemented = CEPH_CAP_PIN;
3862 // make sure mds knows what we want.
3863 if (in.caps_file_wanted() & ~cap->wanted)
3864 in.flags |= I_CAP_DROPPED;
3865 }
3866 }
3867 signal_cond_list(in.waitfor_caps);
3868 }
3869 }
3870
3871
3872 // flush dirty data (from objectcache)
3873
3874 class C_Client_CacheInvalidate : public Context {
3875 private:
3876 Client *client;
3877 vinodeno_t ino;
3878 int64_t offset, length;
3879 public:
3880 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3881 client(c), offset(off), length(len) {
3882 if (client->use_faked_inos())
3883 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3884 else
3885 ino = in->vino();
3886 }
3887 void finish(int r) override {
3888 // _async_invalidate takes the lock when it needs to; call it back from outside the lock.
3889 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
3890 client->_async_invalidate(ino, offset, length);
3891 }
3892 };
3893
3894 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3895 {
3896 if (unmounting)
3897 return;
3898 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3899 ino_invalidate_cb(callback_handle, ino, off, len);
3900 }
3901
3902 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3903
3904 if (ino_invalidate_cb)
3905 // we queue the invalidate, which calls the callback and decrements the ref
3906 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3907 }
3908
3909 void Client::_invalidate_inode_cache(Inode *in)
3910 {
3911 ldout(cct, 10) << __func__ << " " << *in << dendl;
3912
3913 // invalidate our userspace inode cache
3914 if (cct->_conf->client_oc) {
3915 objectcacher->release_set(&in->oset);
3916 if (!objectcacher->set_is_empty(&in->oset))
3917 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3918 }
3919
3920 _schedule_invalidate_callback(in, 0, 0);
3921 }
3922
3923 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3924 {
3925 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3926
3927 // invalidate our userspace inode cache
3928 if (cct->_conf->client_oc) {
3929 vector<ObjectExtent> ls;
3930 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3931 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3932 }
3933
3934 _schedule_invalidate_callback(in, off, len);
3935 }
3936
3937 bool Client::_release(Inode *in)
3938 {
3939 ldout(cct, 20) << "_release " << *in << dendl;
3940 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3941 _invalidate_inode_cache(in);
3942 return true;
3943 }
3944 return false;
3945 }
3946
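// Flush an inode's dirty buffered data. Returns true if there is nothing
// left to flush (nothing was dirty, or the pool is full and the data was
// purged with ENOSPC) and onfinish has been completed; returns false if a
// flush was started, in which case onfinish runs on completion.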
3947 bool Client::_flush(Inode *in, Context *onfinish)
3948 {
3949 ldout(cct, 10) << "_flush " << *in << dendl;
3950
3951 if (!in->oset.dirty_or_tx) {
3952 ldout(cct, 10) << " nothing to flush" << dendl;
3953 onfinish->complete(0);
3954 return true;
3955 }
3956
3957 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3958 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3959 objectcacher->purge_set(&in->oset);
3960 if (onfinish) {
3961 onfinish->complete(-ENOSPC);
3962 }
3963 return true;
3964 }
3965
3966 return objectcacher->flush_set(&in->oset, onfinish);
3967 }
3968
3969 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3970 {
3971 ceph_assert(ceph_mutex_is_locked(client_lock));
3972 if (!in->oset.dirty_or_tx) {
3973 ldout(cct, 10) << " nothing to flush" << dendl;
3974 return;
3975 }
3976
3977 C_SaferCond onflush("Client::_flush_range flock");
3978 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3979 offset, size, &onflush);
3980 if (!ret) {
3981 // wait for flush
3982 client_lock.unlock();
3983 onflush.wait();
3984 client_lock.lock();
3985 }
3986 }
3987
3988 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3989 {
3990 // std::lock_guard l(client_lock);
3991 ceph_assert(ceph_mutex_is_locked(client_lock)); // will be called via dispatch() -> objecter -> ...
3992 Inode *in = static_cast<Inode *>(oset->parent);
3993 ceph_assert(in);
3994 _flushed(in);
3995 }
3996
3997 void Client::_flushed(Inode *in)
3998 {
3999 ldout(cct, 10) << "_flushed " << *in << dendl;
4000
4001 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4002 }
4003
4004
4005
4006 // checks common to add_update_cap, handle_cap_grant
4007 void Client::check_cap_issue(Inode *in, unsigned issued)
4008 {
4009 unsigned had = in->caps_issued();
4010
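// Newly gaining Fc (FILE_CACHE) bumps cache_gen, invalidating cached data;
// newly gaining Fs (FILE_SHARED) bumps shared_gen and, for directories,
// clears the cached dentry complete/ordered flags.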
4011 if ((issued & CEPH_CAP_FILE_CACHE) &&
4012 !(had & CEPH_CAP_FILE_CACHE))
4013 in->cache_gen++;
4014
4015 if ((issued & CEPH_CAP_FILE_SHARED) &&
4016 !(had & CEPH_CAP_FILE_SHARED)) {
4017 in->shared_gen++;
4018
4019 if (in->is_dir())
4020 clear_dir_complete_and_ordered(in, true);
4021 }
4022 }
4023
4024 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4025 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4026 inodeno_t realm, int flags, const UserPerm& cap_perms)
4027 {
4028 if (!in->is_any_caps()) {
4029 ceph_assert(in->snaprealm == 0);
4030 in->snaprealm = get_snap_realm(realm);
4031 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4032 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4033 } else {
4034 ceph_assert(in->snaprealm);
4035 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4036 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4037 in->snaprealm_item.remove_myself();
4038 auto oldrealm = in->snaprealm;
4039 in->snaprealm = get_snap_realm(realm);
4040 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4041 put_snap_realm(oldrealm);
4042 }
4043 }
4044
4045 mds_rank_t mds = mds_session->mds_num;
4046 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4047 Cap &cap = capem.first->second;
4048 if (!capem.second) {
4049 if (cap.gen < mds_session->cap_gen)
4050 cap.issued = cap.implemented = CEPH_CAP_PIN;
4051
4052 /*
4053 * auth mds of the inode changed. we received the cap export
4054 * message, but still haven't received the cap import message.
4055 * handle_cap_export() updated the new auth MDS' cap.
4056 *
4057 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4058 * a message that was send before the cap import message. So
4059 * don't remove caps.
4060 */
4061 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4062 if (&cap != in->auth_cap)
4063 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4064
4065 ceph_assert(cap.cap_id == cap_id);
4066 seq = cap.seq;
4067 mseq = cap.mseq;
4068 issued |= cap.issued;
4069 flags |= CEPH_CAP_FLAG_AUTH;
4070 }
4071 }
4072
4073 check_cap_issue(in, issued);
4074
4075 if (flags & CEPH_CAP_FLAG_AUTH) {
4076 if (in->auth_cap != &cap &&
4077 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4078 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4079 ldout(cct, 10) << __func__ << " changing auth cap: "
4080 << "add myself to new auth MDS' flushing caps list" << dendl;
4081 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4082 }
4083 in->auth_cap = &cap;
4084 }
4085 }
4086
4087 unsigned old_caps = cap.issued;
4088 cap.cap_id = cap_id;
4089 cap.issued = issued;
4090 cap.implemented |= issued;
4091 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4092 cap.wanted = wanted;
4093 else
4094 cap.wanted |= wanted;
4095 cap.seq = seq;
4096 cap.issue_seq = seq;
4097 cap.mseq = mseq;
4098 cap.gen = mds_session->cap_gen;
4099 cap.latest_perms = cap_perms;
4100 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4101 << " from mds." << mds
4102 << " on " << *in
4103 << dendl;
4104
4105 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4106 // is a non-auth MDS revoking the newly granted caps?
4107 for (auto &p : in->caps) {
4108 if (&p.second == &cap)
4109 continue;
4110 if (p.second.implemented & ~p.second.issued & issued) {
4111 check_caps(in, CHECK_CAPS_NODELAY);
4112 break;
4113 }
4114 }
4115 }
4116
4117 if (issued & ~old_caps)
4118 signal_cond_list(in->waitfor_caps);
4119 }
4120
4121 void Client::remove_cap(Cap *cap, bool queue_release)
4122 {
4123 auto &in = cap->inode;
4124 MetaSession *session = cap->session;
4125 mds_rank_t mds = cap->session->mds_num;
4126
4127 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4128
4129 if (queue_release) {
4130 session->enqueue_cap_release(
4131 in.ino,
4132 cap->cap_id,
4133 cap->issue_seq,
4134 cap->mseq,
4135 cap_epoch_barrier);
4136 }
4137
4138 if (in.auth_cap == cap) {
4139 if (in.flushing_cap_item.is_on_list()) {
4140 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4141 in.flushing_cap_item.remove_myself();
4142 }
4143 in.auth_cap = NULL;
4144 }
4145 size_t n = in.caps.erase(mds);
4146 ceph_assert(n == 1);
4147 cap = nullptr;
4148
4149 if (!in.is_any_caps()) {
4150 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4151 in.snaprealm_item.remove_myself();
4152 put_snap_realm(in.snaprealm);
4153 in.snaprealm = 0;
4154 }
4155 }
4156
4157 void Client::remove_all_caps(Inode *in)
4158 {
4159 while (!in->caps.empty())
4160 remove_cap(&in->caps.begin()->second, true);
4161 }
4162
4163 void Client::remove_session_caps(MetaSession *s)
4164 {
4165 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4166
4167 while (s->caps.size()) {
4168 Cap *cap = *s->caps.begin();
4169 InodeRef in(&cap->inode);
4170 bool dirty_caps = false;
4171 if (in->auth_cap == cap) {
4172 dirty_caps = in->dirty_caps | in->flushing_caps;
4173 in->wanted_max_size = 0;
4174 in->requested_max_size = 0;
4175 }
4176 if (cap->wanted | cap->issued)
4177 in->flags |= I_CAP_DROPPED;
4178 remove_cap(cap, false);
4179 in->cap_snaps.clear();
4180 if (dirty_caps) {
4181 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4182 if (in->flushing_caps) {
4183 num_flushing_caps--;
4184 in->flushing_cap_tids.clear();
4185 }
4186 in->flushing_caps = 0;
4187 in->mark_caps_clean();
4188 put_inode(in.get());
4189 }
4190 signal_cond_list(in->waitfor_caps);
4191 }
4192 s->flushing_caps_tids.clear();
4193 sync_cond.notify_all();
4194 }
4195
4196 int Client::_do_remount(bool retry_on_error)
4197 {
4198 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4199
4200 errno = 0;
4201 int r = remount_cb(callback_handle);
4202 if (r == 0) {
4203 retries_on_invalidate = 0;
4204 } else {
4205 int e = errno;
4206 client_t whoami = get_nodeid();
4207 if (r == -1) {
4208 lderr(cct) <<
4209 "failed to remount (to trim kernel dentries): "
4210 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4211 } else {
4212 lderr(cct) <<
4213 "failed to remount (to trim kernel dentries): "
4214 "return code = " << r << dendl;
4215 }
4216 bool should_abort =
4217 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4218 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4219 !(retry_on_error && (++retries_on_invalidate < max_retries));
4220 if (should_abort && !unmounting) {
4221 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4222 ceph_abort();
4223 }
4224 }
4225 return r;
4226 }
4227
4228 class C_Client_Remount : public Context {
4229 private:
4230 Client *client;
4231 public:
4232 explicit C_Client_Remount(Client *c) : client(c) {}
4233 void finish(int r) override {
4234 ceph_assert(r == 0);
4235 client->_do_remount(true);
4236 }
4237 };
4238
4239 void Client::_invalidate_kernel_dcache()
4240 {
4241 if (unmounting)
4242 return;
4243 if (can_invalidate_dentries) {
4244 if (dentry_invalidate_cb && root->dir) {
4245 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4246 p != root->dir->dentries.end();
4247 ++p) {
4248 if (p->second->inode)
4249 _schedule_invalidate_dentry_callback(p->second, false);
4250 }
4251 }
4252 } else if (remount_cb) {
4253 // Hacky: when remounting a file system, the Linux kernel trims all
4254 // unused dentries in the fs
4255 remount_finisher.queue(new C_Client_Remount(this));
4256 }
4257 }
4258
4259 void Client::_trim_negative_child_dentries(InodeRef& in)
4260 {
4261 if (!in->is_dir())
4262 return;
4263
4264 Dir* dir = in->dir;
4265 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4266 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4267 Dentry *dn = p->second;
4268 ++p;
4269 ceph_assert(!dn->inode);
4270 if (dn->lru_is_expireable())
4271 unlink(dn, true, false); // keep dir, drop dentry
4272 }
4273 if (dir->dentries.empty()) {
4274 close_dir(dir);
4275 }
4276 }
4277
4278 if (in->flags & I_SNAPDIR_OPEN) {
4279 InodeRef snapdir = open_snapdir(in.get());
4280 _trim_negative_child_dentries(snapdir);
4281 }
4282 }
4283
4284 class C_Client_CacheRelease : public Context {
4285 private:
4286 Client *client;
4287 vinodeno_t ino;
4288 public:
4289 C_Client_CacheRelease(Client *c, Inode *in) :
4290 client(c) {
4291 if (client->use_faked_inos())
4292 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4293 else
4294 ino = in->vino();
4295 }
4296 void finish(int r) override {
4297 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4298 client->_async_inode_release(ino);
4299 }
4300 };
4301
4302 void Client::_async_inode_release(vinodeno_t ino)
4303 {
4304 if (unmounting)
4305 return;
4306 ldout(cct, 10) << __func__ << " " << ino << dendl;
4307 ino_release_cb(callback_handle, ino);
4308 }
4309
4310 void Client::_schedule_ino_release_callback(Inode *in) {
4311
4312 if (ino_release_cb)
4313 // we queue the release, which calls the callback and decrements the ref
4314 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4315 }
4316
4317 void Client::trim_caps(MetaSession *s, uint64_t max)
4318 {
4319 mds_rank_t mds = s->mds_num;
4320 size_t caps_size = s->caps.size();
4321 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4322 << " caps " << caps_size << dendl;
4323
4324 uint64_t trimmed = 0;
4325 auto p = s->caps.begin();
4326 std::set<Dentry *> to_trim; /* defer dentry trimming so that caps other
4327 * than the one we're looking at don't get deleted during traversal. */
4328 while ((caps_size - trimmed) > max && !p.end()) {
4329 Cap *cap = *p;
4330 InodeRef in(&cap->inode);
4331
4332 // Increment p early because it will be invalidated if cap
4333 // is deleted inside remove_cap
4334 ++p;
4335
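// A non-auth cap is disposable when nothing we are actively using depends on
// bits only this cap provides (i.e. everything used is covered by the auth
// cap's issued set). For the auth (or only) cap we instead try to shrink the
// cache by trimming the inode's dentries.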
4336 if (in->caps.size() > 1 && cap != in->auth_cap) {
4337 int mine = cap->issued | cap->implemented;
4338 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4339 // disposable non-auth cap
4340 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4341 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4342 remove_cap(cap, true); cap = nullptr;
4343 trimmed++;
4344 }
4345 } else {
4346 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4347 _trim_negative_child_dentries(in);
4348 bool all = true;
4349 auto q = in->dentries.begin();
4350 while (q != in->dentries.end()) {
4351 Dentry *dn = *q;
4352 ++q;
4353 if (dn->lru_is_expireable()) {
4354 if (can_invalidate_dentries &&
4355 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4356 // Only issue one of these per DN for inodes in root; others are
4357 // handled more efficiently by the root-child DN invalidation at
4358 // the end of this function.
4359 _schedule_invalidate_dentry_callback(dn, true);
4360 }
4361 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4362 to_trim.insert(dn);
4363 } else {
4364 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4365 all = false;
4366 }
4367 }
4368 if (all && in->ino != MDS_INO_ROOT) {
4369 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4370 trimmed++;
4371 _schedule_ino_release_callback(in.get());
4372 }
4373 }
4374 }
4375 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4376 for (const auto &dn : to_trim) {
4377 trim_dentry(dn);
4378 }
4379 to_trim.clear();
4380
4381 caps_size = s->caps.size();
4382 if (caps_size > (size_t)max)
4383 _invalidate_kernel_dcache();
4384 }
4385
4386 void Client::force_session_readonly(MetaSession *s)
4387 {
4388 s->readonly = true;
4389 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4390 auto &in = (*p)->inode;
4391 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4392 signal_cond_list(in.waitfor_caps);
4393 }
4394 }
4395
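// Transition an inode's dirty caps to the "flushing" state and assign a new
// flush tid, so the FLUSH_ACK from the MDS can later be matched back to this
// particular flush attempt.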
4396 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4397 {
4398 MetaSession *session = in->auth_cap->session;
4399
4400 int flushing = in->dirty_caps;
4401 ceph_assert(flushing);
4402
4403 ceph_tid_t flush_tid = ++last_flush_tid;
4404 in->flushing_cap_tids[flush_tid] = flushing;
4405
4406 if (!in->flushing_caps) {
4407 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4408 num_flushing_caps++;
4409 } else {
4410 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4411 }
4412
4413 in->flushing_caps |= flushing;
4414 in->mark_caps_clean();
4415
4416 if (!in->flushing_cap_item.is_on_list())
4417 session->flushing_caps.push_back(&in->flushing_cap_item);
4418 session->flushing_caps_tids.insert(flush_tid);
4419
4420 *ptid = flush_tid;
4421 return flushing;
4422 }
4423
4424 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4425 {
4426 for (auto &p : in->cap_snaps) {
4427 CapSnap &capsnap = p.second;
4428 if (capsnap.flush_tid > 0) {
4429 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4430 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4431 }
4432 }
4433 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4434 it != in->flushing_cap_tids.end();
4435 ++it) {
4436 old_s->flushing_caps_tids.erase(it->first);
4437 new_s->flushing_caps_tids.insert(it->first);
4438 }
4439 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4440 }
4441
4442 /*
4443 * Flush all caps back to the MDS. Because the callers generally wait on the
4444 * result of this function (syncfs and umount cases), we set
4445 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4446 */
4447 void Client::flush_caps_sync()
4448 {
4449 ldout(cct, 10) << __func__ << dendl;
4450 xlist<Inode*>::iterator p = delayed_list.begin();
4451 while (!p.end()) {
4452 unsigned flags = CHECK_CAPS_NODELAY;
4453 Inode *in = *p;
4454
4455 ++p;
4456 delayed_list.pop_front();
4457 if (p.end() && dirty_list.empty())
4458 flags |= CHECK_CAPS_SYNCHRONOUS;
4459 check_caps(in, flags);
4460 }
4461
4462 // other caps, too
4463 p = dirty_list.begin();
4464 while (!p.end()) {
4465 unsigned flags = CHECK_CAPS_NODELAY;
4466 Inode *in = *p;
4467
4468 ++p;
4469 if (p.end())
4470 flags |= CHECK_CAPS_SYNCHRONOUS;
4471 check_caps(in, flags);
4472 }
4473 }
4474
4475 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4476 {
4477 while (in->flushing_caps) {
4478 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4479 ceph_assert(it != in->flushing_cap_tids.end());
4480 if (it->first > want)
4481 break;
4482 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4483 << ccap_string(it->second) << " want " << want
4484 << " last " << it->first << dendl;
4485 wait_on_list(in->waitfor_caps);
4486 }
4487 }
4488
4489 void Client::wait_sync_caps(ceph_tid_t want)
4490 {
4491 retry:
4492 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4493 << num_flushing_caps << " total flushing)" << dendl;
4494 for (auto &p : mds_sessions) {
4495 MetaSession *s = &p.second;
4496 if (s->flushing_caps_tids.empty())
4497 continue;
4498 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4499 if (oldest_tid <= want) {
4500 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4501 << " (want " << want << ")" << dendl;
4502 std::unique_lock l{client_lock, std::adopt_lock};
4503 sync_cond.wait(l);
4504 l.release();
4505 goto retry;
4506 }
4507 }
4508 }
4509
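// Re-send the cap flushes (and any interleaved cap-snap flushes) that were
// in flight on this inode, in their original tid order, so the MDS applies
// them in sequence after a session interruption.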
4510 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4511 {
4512 in->flags &= ~I_KICK_FLUSH;
4513
4514 Cap *cap = in->auth_cap;
4515 ceph_assert(cap->session == session);
4516
4517 ceph_tid_t last_snap_flush = 0;
4518 for (auto p = in->flushing_cap_tids.rbegin();
4519 p != in->flushing_cap_tids.rend();
4520 ++p) {
4521 if (!p->second) {
4522 last_snap_flush = p->first;
4523 break;
4524 }
4525 }
4526
4527 int wanted = in->caps_wanted();
4528 int used = get_caps_used(in) | in->caps_dirty();
4529 auto it = in->cap_snaps.begin();
4530 for (auto& p : in->flushing_cap_tids) {
4531 if (p.second) {
4532 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4533 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4534 p.second, p.first);
4535 } else {
4536 ceph_assert(it != in->cap_snaps.end());
4537 ceph_assert(it->second.flush_tid == p.first);
4538 send_flush_snap(in, session, it->first, it->second);
4539 ++it;
4540 }
4541 }
4542 }
4543
4544 void Client::kick_flushing_caps(MetaSession *session)
4545 {
4546 mds_rank_t mds = session->mds_num;
4547 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4548
4549 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4550 Inode *in = *p;
4551 if (in->flags & I_KICK_FLUSH) {
4552 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4553 kick_flushing_caps(in, session);
4554 }
4555 }
4556 }
4557
4558 void Client::early_kick_flushing_caps(MetaSession *session)
4559 {
4560 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4561 Inode *in = *p;
4562 Cap *cap = in->auth_cap;
4563 ceph_assert(cap);
4564
4565 // if flushing caps were revoked, we re-send the cap flush during the client
4566 // reconnect stage. This guarantees that the MDS processes the cap flush
4567 // message before issuing the flushing caps to other clients.
4568 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4569 in->flags |= I_KICK_FLUSH;
4570 continue;
4571 }
4572
4573 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4574 << " to mds." << session->mds_num << dendl;
4575 // send_reconnect() will also reset these sequence numbers. Make sure the
4576 // sequence numbers in the cap flush message match the later reconnect message.
4577 cap->seq = 0;
4578 cap->issue_seq = 0;
4579 cap->mseq = 0;
4580 cap->issued = cap->implemented;
4581
4582 kick_flushing_caps(in, session);
4583 }
4584 }
4585
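// Rebuild this realm's cached SnapContext: the union of prior-parent snaps,
// the current parent's snaps since parent_since, and our own snaps, with seq
// being the highest sequence seen. For example, parent snaps {2,5} plus
// my_snaps {3} yield snaps [5,3,2] (descending) and seq >= 5.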
4586 void SnapRealm::build_snap_context()
4587 {
4588 set<snapid_t> snaps;
4589 snapid_t max_seq = seq;
4590
4591 // start with snaps inherited from prior parents
4592 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4593 snaps.insert(prior_parent_snaps[i]);
4594
4595 // current parent's snaps
4596 if (pparent) {
4597 const SnapContext& psnapc = pparent->get_snap_context();
4598 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4599 if (psnapc.snaps[i] >= parent_since)
4600 snaps.insert(psnapc.snaps[i]);
4601 if (psnapc.seq > max_seq)
4602 max_seq = psnapc.seq;
4603 }
4604
4605 // my snaps
4606 for (unsigned i=0; i<my_snaps.size(); i++)
4607 snaps.insert(my_snaps[i]);
4608
4609 // ok!
4610 cached_snap_context.seq = max_seq;
4611 cached_snap_context.snaps.resize(0);
4612 cached_snap_context.snaps.reserve(snaps.size());
4613 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4614 cached_snap_context.snaps.push_back(*p);
4615 }
4616
4617 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4618 {
4619 list<SnapRealm*> q;
4620 q.push_back(realm);
4621
4622 while (!q.empty()) {
4623 realm = q.front();
4624 q.pop_front();
4625
4626 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4627 realm->invalidate_cache();
4628
4629 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4630 p != realm->pchildren.end();
4631 ++p)
4632 q.push_back(*p);
4633 }
4634 }
4635
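// SnapRealms are reference counted by hand: get_snap_realm() creates or pins
// a realm, get_snap_realm_maybe() only pins an existing one, and
// put_snap_realm() unpins, recursing up the parent chain and freeing the
// realm when its count drops to zero.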
4636 SnapRealm *Client::get_snap_realm(inodeno_t r)
4637 {
4638 SnapRealm *realm = snap_realms[r];
4639 if (!realm)
4640 snap_realms[r] = realm = new SnapRealm(r);
4641 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4642 realm->nref++;
4643 return realm;
4644 }
4645
4646 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4647 {
4648 if (snap_realms.count(r) == 0) {
4649 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4650 return NULL;
4651 }
4652 SnapRealm *realm = snap_realms[r];
4653 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4654 realm->nref++;
4655 return realm;
4656 }
4657
4658 void Client::put_snap_realm(SnapRealm *realm)
4659 {
4660 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4661 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4662 if (--realm->nref == 0) {
4663 snap_realms.erase(realm->ino);
4664 if (realm->pparent) {
4665 realm->pparent->pchildren.erase(realm);
4666 put_snap_realm(realm->pparent);
4667 }
4668 delete realm;
4669 }
4670 }
4671
4672 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4673 {
4674 if (realm->parent != parent) {
4675 ldout(cct, 10) << __func__ << " " << *realm
4676 << " " << realm->parent << " -> " << parent << dendl;
4677 realm->parent = parent;
4678 if (realm->pparent) {
4679 realm->pparent->pchildren.erase(realm);
4680 put_snap_realm(realm->pparent);
4681 }
4682 realm->pparent = get_snap_realm(parent);
4683 realm->pparent->pchildren.insert(realm);
4684 return true;
4685 }
4686 return false;
4687 }
4688
4689 static bool has_new_snaps(const SnapContext& old_snapc,
4690 const SnapContext& new_snapc)
4691 {
4692 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4693 }
4694
4695
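// Decode a snap trace (a sequence of SnapRealmInfo records from the MDS) and
// apply it: realms whose seq advanced get new snap lists and have their
// cached contexts (and their children's) invalidated; with 'flush' set,
// dirty inodes under an updated realm are queued for cap-snap writeback
// against the old snap context first.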
4696 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4697 {
4698 SnapRealm *first_realm = NULL;
4699 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4700
4701 map<SnapRealm*, SnapContext> dirty_realms;
4702
4703 auto p = bl.cbegin();
4704 while (!p.end()) {
4705 SnapRealmInfo info;
4706 decode(info, p);
4707 SnapRealm *realm = get_snap_realm(info.ino());
4708
4709 bool invalidate = false;
4710
4711 if (info.seq() > realm->seq) {
4712 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4713 << dendl;
4714
4715 if (flush) {
4716 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4717 // flush me + children
4718 list<SnapRealm*> q;
4719 q.push_back(realm);
4720 while (!q.empty()) {
4721 SnapRealm *realm = q.front();
4722 q.pop_front();
4723
4724 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4725 p != realm->pchildren.end();
4726 ++p)
4727 q.push_back(*p);
4728
4729 if (dirty_realms.count(realm) == 0) {
4730 realm->nref++;
4731 dirty_realms[realm] = realm->get_snap_context();
4732 }
4733 }
4734 }
4735
4736 // update
4737 realm->seq = info.seq();
4738 realm->created = info.created();
4739 realm->parent_since = info.parent_since();
4740 realm->prior_parent_snaps = info.prior_parent_snaps;
4741 realm->my_snaps = info.my_snaps;
4742 invalidate = true;
4743 }
4744
4745 // _always_ verify parent
4746 if (adjust_realm_parent(realm, info.parent()))
4747 invalidate = true;
4748
4749 if (invalidate) {
4750 invalidate_snaprealm_and_children(realm);
4751 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4752 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4753 } else {
4754 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4755 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4756 }
4757
4758 if (!first_realm)
4759 first_realm = realm;
4760 else
4761 put_snap_realm(realm);
4762 }
4763
4764 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4765 q != dirty_realms.end();
4766 ++q) {
4767 SnapRealm *realm = q->first;
4768 // are there new snaps?
4769 if (has_new_snaps(q->second, realm->get_snap_context())) {
4770 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4771 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4772 while (!r.end()) {
4773 Inode *in = *r;
4774 ++r;
4775 queue_cap_snap(in, q->second);
4776 }
4777 } else {
4778 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4779 }
4780 put_snap_realm(realm);
4781 }
4782
4783 if (realm_ret)
4784 *realm_ret = first_realm;
4785 else
4786 put_snap_realm(first_realm);
4787 }
4788
4789 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4790 {
4791 ldout(cct, 10) << __func__ << " " << *m << dendl;
4792 mds_rank_t mds = mds_rank_t(m->get_source().num());
4793 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4794 if (!session) {
4795 return;
4796 }
4797
4798 got_mds_push(session);
4799
4800 map<Inode*, SnapContext> to_move;
4801 SnapRealm *realm = 0;
4802
4803 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4804 ceph_assert(m->head.split);
4805 SnapRealmInfo info;
4806 auto p = m->bl.cbegin();
4807 decode(info, p);
4808 ceph_assert(info.ino() == m->head.split);
4809
4810 // flush, then move, the affected inodes.
4811 realm = get_snap_realm(info.ino());
4812 ldout(cct, 10) << " splitting off " << *realm << dendl;
4813 for (auto& ino : m->split_inos) {
4814 vinodeno_t vino(ino, CEPH_NOSNAP);
4815 if (inode_map.count(vino)) {
4816 Inode *in = inode_map[vino];
4817 if (!in->snaprealm || in->snaprealm == realm)
4818 continue;
4819 if (in->snaprealm->created > info.created()) {
4820 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4821 << *in->snaprealm << dendl;
4822 continue;
4823 }
4824 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4825
4826
4827 in->snaprealm_item.remove_myself();
4828 to_move[in] = in->snaprealm->get_snap_context();
4829 put_snap_realm(in->snaprealm);
4830 }
4831 }
4832
4833 // move child snaprealms, too
4834 for (auto& child_realm : m->split_realms) {
4835 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
4836 SnapRealm *child = get_snap_realm_maybe(child_realm);
4837 if (!child)
4838 continue;
4839 adjust_realm_parent(child, realm->ino);
4840 put_snap_realm(child);
4841 }
4842 }
4843
4844 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4845
4846 if (realm) {
4847 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4848 Inode *in = p->first;
4849 in->snaprealm = realm;
4850 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4851 realm->nref++;
4852 // queue for snap writeback
4853 if (has_new_snaps(p->second, realm->get_snap_context()))
4854 queue_cap_snap(in, p->second);
4855 }
4856 put_snap_realm(realm);
4857 }
4858 }
4859
4860 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4861 {
4862 mds_rank_t mds = mds_rank_t(m->get_source().num());
4863 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4864 if (!session) {
4865 return;
4866 }
4867
4868 got_mds_push(session);
4869
4870 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4871
4872 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4873 if (inode_map.count(vino)) {
4874 Inode *in = inode_map[vino];
4876
4877 if (in) {
4878 in->quota = m->quota;
4879 in->rstat = m->rstat;
4880 }
4881 }
4882 }
4883
4884 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4885 {
4886 mds_rank_t mds = mds_rank_t(m->get_source().num());
4887 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4888 if (!session) {
4889 return;
4890 }
4891
4892 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4893 // Pause RADOS operations until we see the required epoch
4894 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4895 }
4896
4897 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4898 // Record the barrier so that we will transmit it to MDS when releasing
4899 set_cap_epoch_barrier(m->osd_epoch_barrier);
4900 }
4901
4902 got_mds_push(session);
4903
4904 Inode *in;
4905 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4906 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4907 in = it->second;
4908 } else {
4909 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4910 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4911 session->enqueue_cap_release(
4912 m->get_ino(),
4913 m->get_cap_id(),
4914 m->get_seq(),
4915 m->get_mseq(),
4916 cap_epoch_barrier);
4917 } else {
4918 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4919 }
4920
4921 // in case the mds is waiting on e.g. a revocation
4922 flush_cap_releases();
4923 return;
4924 }
4925
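// EXPORT and FLUSHSNAP_ACK are handled entirely in their own handlers;
// IMPORT is processed first and then falls through to the per-cap dispatch
// below, since the import may have just installed the cap we look up next.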
4926 switch (m->get_op()) {
4927 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4928 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4929 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4930 }
4931
4932 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4933 Cap &cap = in->caps.at(mds);
4934
4935 switch (m->get_op()) {
4936 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4937 case CEPH_CAP_OP_IMPORT:
4938 case CEPH_CAP_OP_REVOKE:
4939 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4940 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4941 }
4942 } else {
4943 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4944 return;
4945 }
4946 }
4947
4948 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4949 {
4950 mds_rank_t mds = session->mds_num;
4951
4952 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4953 << " IMPORT from mds." << mds << dendl;
4954
4955 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4956 Cap *cap = NULL;
4957 UserPerm cap_perms;
4958 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
4959 cap = &it->second;
4960 cap_perms = cap->latest_perms;
4961 }
4962
4963 // add/update it
4964 SnapRealm *realm = NULL;
4965 update_snap_trace(m->snapbl, &realm);
4966
4967 int issued = m->get_caps();
4968 int wanted = m->get_wanted();
4969 add_update_cap(in, session, m->get_cap_id(),
4970 issued, wanted, m->get_seq(), m->get_mseq(),
4971 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
4972
4973 if (cap && cap->cap_id == m->peer.cap_id) {
4974 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4975 }
4976
4977 if (realm)
4978 put_snap_realm(realm);
4979
4980 if (in->auth_cap && in->auth_cap->session == session) {
4981 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
4982 in->requested_max_size > m->get_max_size()) {
4983 in->requested_max_size = 0;
4984 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
4985 }
4986 // reflush any/all caps (if we are now the auth_cap)
4987 kick_flushing_caps(in, session);
4988 }
4989 }
4990
4991 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
4992 {
4993 mds_rank_t mds = session->mds_num;
4994
4995 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
4996 << " EXPORT from mds." << mds << dendl;
4997
4998 auto it = in->caps.find(mds);
4999 if (it != in->caps.end()) {
5000 Cap &cap = it->second;
5001 if (cap.cap_id == m->get_cap_id()) {
5002 if (m->peer.cap_id) {
5003 const auto peer_mds = mds_rank_t(m->peer.mds);
5004 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
5005 auto it = in->caps.find(peer_mds);
5006 if (it != in->caps.end()) {
5007 Cap &tcap = it->second;
5008 if (tcap.cap_id == m->peer.cap_id &&
5009 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5010 tcap.cap_id = m->peer.cap_id;
5011 tcap.seq = m->peer.seq - 1;
5012 tcap.issue_seq = tcap.seq;
5013 tcap.issued |= cap.issued;
5014 tcap.implemented |= cap.issued;
5015 if (&cap == in->auth_cap)
5016 in->auth_cap = &tcap;
5017 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5018 adjust_session_flushing_caps(in, session, tsession);
5019 }
5020 } else {
5021 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
5022 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5023 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5024 cap.latest_perms);
5025 }
5026 } else {
5027 if (cap.wanted | cap.issued)
5028 in->flags |= I_CAP_DROPPED;
5029 }
5030
5031 remove_cap(&cap, false);
5032 }
5033 }
5034 }
5035
5036 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5037 {
5038 mds_rank_t mds = session->mds_num;
5039 ceph_assert(in->caps.count(mds));
5040
5041 ldout(cct, 10) << __func__ << " on ino " << *in
5042 << " size " << in->size << " -> " << m->get_size()
5043 << dendl;
5044
5045 int issued;
5046 in->caps_issued(&issued);
5047 issued |= in->caps_dirty();
5048 update_inode_file_size(in, issued, m->get_size(),
5049 m->get_truncate_seq(), m->get_truncate_size());
5050 }
5051
5052 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5053 {
5054 ceph_tid_t flush_ack_tid = m->get_client_tid();
5055 int dirty = m->get_dirty();
5056 int cleaned = 0;
5057 int flushed = 0;
5058
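// Walk the pending flush tids in order: entries with a zero cap set are
// cap-snap flushes (retired by handle_cap_flushsnap_ack, skip them here);
// everything with tid <= flush_ack_tid is considered acked and removed.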
5059 auto it = in->flushing_cap_tids.begin();
5060 if (it->first < flush_ack_tid) {
5061 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5062 << " got unexpected flush ack tid " << flush_ack_tid
5063 << " expected is " << it->first << dendl;
5064 }
5065 for (; it != in->flushing_cap_tids.end(); ) {
5066 if (!it->second) {
5067 // cap snap
5068 ++it;
5069 continue;
5070 }
5071 if (it->first == flush_ack_tid)
5072 cleaned = it->second;
5073 if (it->first <= flush_ack_tid) {
5074 session->flushing_caps_tids.erase(it->first);
5075 in->flushing_cap_tids.erase(it++);
5076 ++flushed;
5077 continue;
5078 }
5079 cleaned &= ~it->second;
5080 if (!cleaned)
5081 break;
5082 ++it;
5083 }
5084
5085 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5086 << " cleaned " << ccap_string(cleaned) << " on " << *in
5087 << " with " << ccap_string(dirty) << dendl;
5088
5089 if (flushed) {
5090 signal_cond_list(in->waitfor_caps);
5091 if (session->flushing_caps_tids.empty() ||
5092 *session->flushing_caps_tids.begin() > flush_ack_tid)
5093 sync_cond.notify_all();
5094 }
5095
5096 if (!dirty) {
5097 in->cap_dirtier_uid = -1;
5098 in->cap_dirtier_gid = -1;
5099 }
5100
5101 if (!cleaned) {
5102 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5103 } else {
5104 if (in->flushing_caps) {
5105 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5106 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5107 in->flushing_caps &= ~cleaned;
5108 if (in->flushing_caps == 0) {
5109 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5110 num_flushing_caps--;
5111 if (in->flushing_cap_tids.empty())
5112 in->flushing_cap_item.remove_myself();
5113 }
5114 if (!in->caps_dirty())
5115 put_inode(in);
5116 }
5117 }
5118 }
5119
5120
5121 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5122 {
5123 ceph_tid_t flush_ack_tid = m->get_client_tid();
5124 mds_rank_t mds = session->mds_num;
5125 ceph_assert(in->caps.count(mds));
5126 snapid_t follows = m->get_snap_follows();
5127
5128 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5129 auto& capsnap = it->second;
5130 if (flush_ack_tid != capsnap.flush_tid) {
5131 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5132 } else {
5133 InodeRef tmp_ref(in);
5134 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5135 << " on " << *in << dendl;
5136 session->flushing_caps_tids.erase(capsnap.flush_tid);
5137 in->flushing_cap_tids.erase(capsnap.flush_tid);
5138 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5139 in->flushing_cap_item.remove_myself();
5140 in->cap_snaps.erase(it);
5141
5142 signal_cond_list(in->waitfor_caps);
5143 if (session->flushing_caps_tids.empty() ||
5144 *session->flushing_caps_tids.begin() > flush_ack_tid)
5145 sync_cond.notify_all();
5146 }
5147 } else {
5148 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5149 << " on " << *in << dendl;
5150 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5151 }
5152 }
5153
5154 class C_Client_DentryInvalidate : public Context {
5155 private:
5156 Client *client;
5157 vinodeno_t dirino;
5158 vinodeno_t ino;
5159 string name;
5160 public:
5161 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5162 client(c), name(dn->name) {
5163 if (client->use_faked_inos()) {
5164 dirino.ino = dn->dir->parent_inode->faked_ino;
5165 if (del)
5166 ino.ino = dn->inode->faked_ino;
5167 } else {
5168 dirino = dn->dir->parent_inode->vino();
5169 if (del)
5170 ino = dn->inode->vino();
5171 }
5172 if (!del)
5173 ino.ino = inodeno_t();
5174 }
5175 void finish(int r) override {
5176 // _async_dentry_invalidate is responsible for its own locking
5177 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5178 client->_async_dentry_invalidate(dirino, ino, name);
5179 }
5180 };
5181
5182 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5183 {
5184 if (unmounting)
5185 return;
5186 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5187 << " in dir " << dirino << dendl;
5188 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5189 }
5190
5191 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5192 {
5193 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5194 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5195 }
5196
5197 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5198 {
5199 int ref = in->get_num_ref();
5200 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5201
5202 if (in->dir && !in->dir->dentries.empty()) {
5203 for (auto p = in->dir->dentries.begin();
5204 p != in->dir->dentries.end(); ) {
5205 Dentry *dn = p->second;
5206 ++p;
5207 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5208 * We don't need to invalidate dentries recursively, because invalidating
5209 * a directory dentry effectively invalidates the whole subtree. */
5211 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5212 _try_to_trim_inode(dn->inode.get(), false);
5213
5214 if (dn->lru_is_expireable())
5215 unlink(dn, true, false); // keep dir, drop dentry
5216 }
5217 if (in->dir->dentries.empty()) {
5218 close_dir(in->dir);
5219 --ref;
5220 }
5221 }
5222
5223 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5224 InodeRef snapdir = open_snapdir(in);
5225 _try_to_trim_inode(snapdir.get(), false);
5226 --ref;
5227 }
5228
5229 if (ref > 0) {
5230 auto q = in->dentries.begin();
5231 while (q != in->dentries.end()) {
5232 Dentry *dn = *q;
5233 ++q;
5234 if (in->ll_ref > 0 && sched_inval) {
5235 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5236 // so in->dentries doesn't always reflect the state of kernel's dcache.
5237 _schedule_invalidate_dentry_callback(dn, true);
5238 }
5239 unlink(dn, true, true);
5240 }
5241 }
5242 }
5243
5244 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5245 {
5246 mds_rank_t mds = session->mds_num;
5247 int used = get_caps_used(in);
5248 int wanted = in->caps_wanted();
5249
5250 const unsigned new_caps = m->get_caps();
5251 const bool was_stale = session->cap_gen > cap->gen;
5252 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5253 << " mds." << mds << " seq " << m->get_seq()
5254 << " caps now " << ccap_string(new_caps)
5255 << " was " << ccap_string(cap->issued)
5256 << (was_stale ? " (stale)" : "") << dendl;
5257
5258 if (was_stale)
5259 cap->issued = cap->implemented = CEPH_CAP_PIN;
5260 cap->seq = m->get_seq();
5261 cap->gen = session->cap_gen;
5262
5263 check_cap_issue(in, new_caps);
5264
5265 // update inode
5266 int issued;
5267 in->caps_issued(&issued);
5268 issued |= in->caps_dirty();
5269
5270 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5271 !(issued & CEPH_CAP_AUTH_EXCL)) {
5272 in->mode = m->head.mode;
5273 in->uid = m->head.uid;
5274 in->gid = m->head.gid;
5275 in->btime = m->btime;
5276 }
5277 bool deleted_inode = false;
5278 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5279 !(issued & CEPH_CAP_LINK_EXCL)) {
5280 in->nlink = m->head.nlink;
5281 if (in->nlink == 0 &&
5282 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5283 deleted_inode = true;
5284 }
5285 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5286 m->xattrbl.length() &&
5287 m->head.xattr_version > in->xattr_version) {
5288 auto p = m->xattrbl.cbegin();
5289 decode(in->xattrs, p);
5290 in->xattr_version = m->head.xattr_version;
5291 }
5292
5293 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5294 in->dirstat.nfiles = m->get_nfiles();
5295 in->dirstat.nsubdirs = m->get_nsubdirs();
5296 }
5297
5298 if (new_caps & CEPH_CAP_ANY_RD) {
5299 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5300 m->get_ctime(), m->get_mtime(), m->get_atime());
5301 }
5302
5303 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5304 in->layout = m->get_layout();
5305 update_inode_file_size(in, issued, m->get_size(),
5306 m->get_truncate_seq(), m->get_truncate_size());
5307 }
5308
5309 if (m->inline_version > in->inline_version) {
5310 in->inline_data = m->inline_data;
5311 in->inline_version = m->inline_version;
5312 }
5313
5314 /* always take a newer change attr */
5315 if (m->get_change_attr() > in->change_attr)
5316 in->change_attr = m->get_change_attr();
5317
5318 // max_size
5319 if (cap == in->auth_cap &&
5320 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5321 (m->get_max_size() != in->max_size)) {
5322 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5323 in->max_size = m->get_max_size();
5324 if (in->max_size > in->wanted_max_size) {
5325 in->wanted_max_size = 0;
5326 in->requested_max_size = 0;
5327 }
5328 }
5329
5330 bool check = false;
5331 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5332 (wanted & ~(cap->wanted | new_caps))) {
5333 // If the MDS is importing the cap, prior cap messages that update 'wanted'
5334 // may get dropped by the MDS (migrate seq mismatch).
5335 //
5336 // We don't send a cap message to update 'wanted' if what we want is
5337 // already issued. If the MDS revokes caps, the cap message releasing them
5338 // also tells the MDS what we want. But if caps were revoked forcibly
5339 // (stale session), we may not have told the MDS what we want.
5340 check = true;
5341 }
5342
5343
5344 // update caps
5345 auto revoked = cap->issued & ~new_caps;
5346 if (revoked) {
5347 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5348 cap->issued = new_caps;
5349 cap->implemented |= new_caps;
5350
5351 // recall delegations if we're losing caps necessary for them
5352 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5353 in->recall_deleg(false);
5354 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5355 in->recall_deleg(true);
5356
5357 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5358 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5359 !_flush(in, new C_Client_FlushComplete(this, in))) {
5360 // waiting for the flush to complete
5361 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5362 if (_release(in))
5363 check = true;
5364 } else {
5365 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5366 check = true;
5367 }
5368 } else if (cap->issued == new_caps) {
5369 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5370 } else {
5371 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5372 cap->issued = new_caps;
5373 cap->implemented |= new_caps;
5374
5375 if (cap == in->auth_cap) {
5376 // is a non-auth MDS revoking the newly granted caps?
5377 for (const auto &p : in->caps) {
5378 if (&p.second == cap)
5379 continue;
5380 if (p.second.implemented & ~p.second.issued & new_caps) {
5381 check = true;
5382 break;
5383 }
5384 }
5385 }
5386 }
5387
5388 if (check)
5389 check_caps(in, 0);
5390
5391 // wake up waiters
5392 if (new_caps)
5393 signal_cond_list(in->waitfor_caps);
5394
5395 // may drop inode's last ref
5396 if (deleted_inode)
5397 _try_to_trim_inode(in, true);
5398 }
5399
5400 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5401 {
5402 if (perms.uid() == 0)
5403 return 0;
5404
5405 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5406 int ret = _posix_acl_permission(in, perms, want);
5407 if (ret != -EAGAIN)
5408 return ret;
5409 }
5410
5411 // check permissions before doing anything else
5412 if (!in->check_mode(perms, want))
5413 return -EACCES;
5414 return 0;
5415 }
5416
5417 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5418 const UserPerm& perms)
5419 {
5420 int r = _getattr_for_perm(in, perms);
5421 if (r < 0)
5422 goto out;
5423
5424 r = 0;
5425 if (strncmp(name, "system.", 7) == 0) {
5426 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5427 r = -EPERM;
5428 } else {
5429 r = inode_permission(in, perms, want);
5430 }
5431 out:
5432 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5433 return r;
5434 }
5435
5436 ostream& operator<<(ostream &out, const UserPerm& perm) {
5437 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5438 return out;
5439 }
5440
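// The checks below roughly mirror POSIX setattr rules: only root or the
// owner may change ownership or mode (chgrp also requires membership in the
// target group, and setting a group we're not in drops S_ISGID); explicit
// timestamps require ownership, while "set to now" only needs write
// permission.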
5441 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5442 const UserPerm& perms)
5443 {
5444 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5445 int r = _getattr_for_perm(in, perms);
5446 if (r < 0)
5447 goto out;
5448
5449 if (mask & CEPH_SETATTR_SIZE) {
5450 r = inode_permission(in, perms, MAY_WRITE);
5451 if (r < 0)
5452 goto out;
5453 }
5454
5455 r = -EPERM;
5456 if (mask & CEPH_SETATTR_UID) {
5457 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5458 goto out;
5459 }
5460 if (mask & CEPH_SETATTR_GID) {
5461 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5462 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5463 goto out;
5464 }
5465
5466 if (mask & CEPH_SETATTR_MODE) {
5467 if (perms.uid() != 0 && perms.uid() != in->uid)
5468 goto out;
5469
5470 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5471 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5472 stx->stx_mode &= ~S_ISGID;
5473 }
5474
5475 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5476 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5477 if (perms.uid() != 0 && perms.uid() != in->uid) {
5478 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5479 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5480 check_mask |= CEPH_SETATTR_MTIME;
5481 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5482 check_mask |= CEPH_SETATTR_ATIME;
5483 if (check_mask & mask) {
5484 goto out;
5485 } else {
5486 r = inode_permission(in, perms, MAY_WRITE);
5487 if (r < 0)
5488 goto out;
5489 }
5490 }
5491 }
5492 r = 0;
5493 out:
5494 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5495 return r;
5496 }
5497
5498 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5499 {
5500 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5501 unsigned want = 0;
5502
5503 if ((flags & O_ACCMODE) == O_WRONLY)
5504 want = MAY_WRITE;
5505 else if ((flags & O_ACCMODE) == O_RDWR)
5506 want = MAY_READ | MAY_WRITE;
5507 else if ((flags & O_ACCMODE) == O_RDONLY)
5508 want = MAY_READ;
5509 if (flags & O_TRUNC)
5510 want |= MAY_WRITE;
5511
5512 int r = 0;
5513 switch (in->mode & S_IFMT) {
5514 case S_IFLNK:
5515 r = -ELOOP;
5516 goto out;
5517 case S_IFDIR:
5518 if (want & MAY_WRITE) {
5519 r = -EISDIR;
5520 goto out;
5521 }
5522 break;
5523 }
5524
5525 r = _getattr_for_perm(in, perms);
5526 if (r < 0)
5527 goto out;
5528
5529 r = inode_permission(in, perms, want);
5530 out:
5531 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5532 return r;
5533 }
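
// A worked example of the O_ACCMODE mapping above (values illustrative):
//
//   may_open(in, O_RDWR | O_TRUNC, perms)  // want = MAY_READ | MAY_WRITE
//   may_open(in, O_WRONLY, perms)          // want = MAY_WRITE
//   may_open(in, O_RDONLY, perms)          // want = MAY_READ; -ELOOP if a symlink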
5534
5535 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5536 {
5537 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5538 int r = _getattr_for_perm(dir, perms);
5539 if (r < 0)
5540 goto out;
5541
5542 r = inode_permission(dir, perms, MAY_EXEC);
5543 out:
5544 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5545 return r;
5546 }
5547
5548 int Client::may_create(Inode *dir, const UserPerm& perms)
5549 {
5550 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5551 int r = _getattr_for_perm(dir, perms);
5552 if (r < 0)
5553 goto out;
5554
5555 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5556 out:
5557 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5558 return r;
5559 }
5560
5561 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5562 {
5563 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5564 int r = _getattr_for_perm(dir, perms);
5565 if (r < 0)
5566 goto out;
5567
5568 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5569 if (r < 0)
5570 goto out;
5571
5572 /* 'name == NULL' means rmsnap */
5573 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5574 InodeRef otherin;
5575 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5576 if (r < 0)
5577 goto out;
5578 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5579 r = -EPERM;
5580 }
5581 out:
5582 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5583 return r;
5584 }
5585
5586 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5587 {
5588 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5589 int r = _getattr_for_perm(in, perms);
5590 if (r < 0)
5591 goto out;
5592
5593 if (perms.uid() == 0 || perms.uid() == in->uid) {
5594 r = 0;
5595 goto out;
5596 }
5597
5598 r = -EPERM;
5599 if (!S_ISREG(in->mode))
5600 goto out;
5601
5602 if (in->mode & S_ISUID)
5603 goto out;
5604
5605 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5606 goto out;
5607
5608 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5609 out:
5610 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5611 return r;
5612 }
5613
5614 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5615 {
5616 int mask = CEPH_STAT_CAP_MODE;
5617 bool force = false;
5618 if (acl_type != NO_ACL) {
5619 mask |= CEPH_STAT_CAP_XATTR;
5620 force = in->xattr_version == 0;
5621 }
5622 return _getattr(in, mask, perms, force);
5623 }
5624
5625 vinodeno_t Client::_get_vino(Inode *in)
5626 {
5627 /* The caller must hold the client lock */
5628 return vinodeno_t(in->ino, in->snapid);
5629 }
5630
5631 /**
5632 * Resolve an MDS spec to a list of MDS daemon GIDs.
5633 *
5634 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5635 * It may be '*' in which case it matches all GIDs.
5636 *
5637 * If no error is returned, the `targets` vector will be populated with at least
5638 * one MDS.
5639 */
5640 int Client::resolve_mds(
5641 const std::string &mds_spec,
5642 std::vector<mds_gid_t> *targets)
5643 {
5644 ceph_assert(fsmap);
5645 ceph_assert(targets != nullptr);
5646
5647 mds_role_t role;
5648 std::stringstream ss;
5649 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5650 if (role_r == 0) {
5651 // We got a role, resolve it to a GID
5652 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5653 << role << "'" << dendl;
5654 targets->push_back(
5655 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5656 return 0;
5657 }
5658
5659 std::string strtol_err;
5660 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5661 if (strtol_err.empty()) {
5662 // It is a possible GID
5663 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5664 if (fsmap->gid_exists(mds_gid)) {
5665 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5666 targets->push_back(mds_gid);
5667 } else {
5668 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5669 << dendl;
5670 return -ENOENT;
5671 }
5672 } else if (mds_spec == "*") {
5673 // It is a wildcard: use all MDSs
5674 const auto mds_info = fsmap->get_mds_info();
5675
5676 if (mds_info.empty()) {
5677 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5678 return -ENOENT;
5679 }
5680
5681 for (const auto& i : mds_info) {
5682 targets->push_back(i.first);
5683 }
5684 } else {
5685 // It did not parse as an integer, it is not a wildcard, it must be a name
5686 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5687 if (mds_gid == 0) {
5688 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5689
5690 lderr(cct) << "FSMap: " << *fsmap << dendl;
5691
5692 return -ENOENT;
5693 } else {
5694 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5695 << "' to GID " << mds_gid << dendl;
5696 targets->push_back(mds_gid);
5697 }
5698 }
5699
5700 return 0;
5701 }
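
// Example specs accepted above, in parsing order: role first, then numeric
// GID, then the wildcard, then daemon name. The concrete values here are
// illustrative, not from a real map:
//
//   resolve_mds("0", &targets);        // rank 0 of the default filesystem
//   resolve_mds("myfs:0", &targets);   // filesystem:rank role
//   resolve_mds("4242", &targets);     // a daemon GID, if it exists
//   resolve_mds("*", &targets);        // every MDS in the FSMap
//   resolve_mds("a", &targets);        // a daemon by name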
5702
5703
5704 /**
5705 * Authenticate with mon and establish global ID
5706 */
5707 int Client::authenticate()
5708 {
5709 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5710
5711 if (monclient->is_authenticated()) {
5712 return 0;
5713 }
5714
5715 client_lock.unlock();
5716 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5717 client_lock.lock();
5718 if (r < 0) {
5719 return r;
5720 }
5721
5722 whoami = monclient->get_global_id();
5723 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5724
5725 return 0;
5726 }
5727
5728 int Client::fetch_fsmap(bool user)
5729 {
5730 int r;
5731 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5732 // rather than MDSMap because no single MDSMap contains all the daemons, and
5733 // a `tell` can address any daemon.
5734 version_t fsmap_latest;
5735 do {
5736 C_SaferCond cond;
5737 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5738 client_lock.unlock();
5739 r = cond.wait();
5740 client_lock.lock();
5741 } while (r == -EAGAIN);
5742
5743 if (r < 0) {
5744 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5745 return r;
5746 }
5747
5748 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5749
5750 if (user) {
5751 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5752 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5753 monclient->renew_subs();
5754 wait_on_list(waiting_for_fsmap);
5755 }
5756 ceph_assert(fsmap_user);
5757 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5758 } else {
5759 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5760 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5761 monclient->renew_subs();
5762 wait_on_list(waiting_for_fsmap);
5763 }
5764 ceph_assert(fsmap);
5765 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5766 }
5767 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5768 << fsmap_latest << dendl;
5769 return 0;
5770 }
5771
5772 /**
5773 * Send a command to one or more MDS daemons.
5774 *
5775 * @param mds_spec one of ID, rank, GID, "*"
5776 */
5777 int Client::mds_command(
5778 const std::string &mds_spec,
5779 const vector<string>& cmd,
5780 const bufferlist& inbl,
5781 bufferlist *outbl,
5782 string *outs,
5783 Context *onfinish)
5784 {
5785 std::lock_guard lock(client_lock);
5786
5787 if (!initialized)
5788 return -ENOTCONN;
5789
5790 int r;
5791 r = authenticate();
5792 if (r < 0) {
5793 return r;
5794 }
5795
5796 r = fetch_fsmap(false);
5797 if (r < 0) {
5798 return r;
5799 }
5800
5801 // Look up MDS target(s) of the command
5802 std::vector<mds_gid_t> targets;
5803 r = resolve_mds(mds_spec, &targets);
5804 if (r < 0) {
5805 return r;
5806 }
5807
5808 // If daemons are laggy, we won't send them commands. If all
5809 // are laggy then we fail.
5810 std::vector<mds_gid_t> non_laggy;
5811 for (const auto gid : targets) {
5812 const auto info = fsmap->get_info_gid(gid);
5813 if (!info.laggy()) {
5814 non_laggy.push_back(gid);
5815 }
5816 }
5817 if (non_laggy.empty()) {
5818 *outs = "All targeted MDS daemons are laggy";
5819 return -ENOENT;
5820 }
5821
5822 if (metadata.empty()) {
5823 // We are called on an unmounted client, so metadata
5824 // won't be initialized yet.
5825 populate_metadata("");
5826 }
5827
5828 // Send commands to targets
5829 C_GatherBuilder gather(cct, onfinish);
5830 for (const auto target_gid : non_laggy) {
5831 const auto info = fsmap->get_info_gid(target_gid);
5832
5833 // Open a connection to the target MDS
5834 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5835
5836 // Generate MDSCommandOp state
5837 auto &op = command_table.start_command();
5838
5839 op.on_finish = gather.new_sub();
5840 op.cmd = cmd;
5841 op.outbl = outbl;
5842 op.outs = outs;
5843 op.inbl = inbl;
5844 op.mds_gid = target_gid;
5845 op.con = conn;
5846
5847 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5848 << " tid=" << op.tid << cmd << dendl;
5849
5850 // Construct and send MCommand
5851 auto m = op.get_message(monclient->get_fsid());
5852 conn->send_message2(std::move(m));
5853 }
5854 gather.activate();
5855
5856 return 0;
5857 }
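
// A minimal sketch of driving mds_command() synchronously. The `client`
// pointer and the command string are assumptions; any JSON command the MDS
// understands would do:
//
//   C_SaferCond cond;
//   bufferlist outbl;
//   std::string outs;
//   std::vector<std::string> cmd = {"{\"prefix\": \"session ls\"}"};
//   int r = client->mds_command("*", cmd, {}, &outbl, &outs, &cond);
//   if (r == 0)
//     r = cond.wait();  // fires once every targeted MDS has replied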
5858
5859 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5860 {
5861 ceph_tid_t const tid = m->get_tid();
5862
5863 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5864
5865 if (!command_table.exists(tid)) {
5866 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5867 return;
5868 }
5869
5870 auto &op = command_table.get_command(tid);
5871 if (op.outbl) {
5872 *op.outbl = m->get_data();
5873 }
5874 if (op.outs) {
5875 *op.outs = m->rs;
5876 }
5877
5878 if (op.on_finish) {
5879 op.on_finish->complete(m->r);
5880 }
5881
5882 command_table.erase(tid);
5883 }
5884
5885 // -------------------
5886 // MOUNT
5887
5888 int Client::subscribe_mdsmap(const std::string &fs_name)
5889 {
5890 int r = authenticate();
5891 if (r < 0) {
5892 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5893 return r;
5894 }
5895
5896 std::string resolved_fs_name;
5897 if (fs_name.empty()) {
5898 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5899 if (resolved_fs_name.empty())
5900 // Try the backwards compatibility fs name option
5901 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5902 } else {
5903 resolved_fs_name = fs_name;
5904 }
5905
5906 std::string want = "mdsmap";
5907 if (!resolved_fs_name.empty()) {
5908 r = fetch_fsmap(true);
5909 if (r < 0)
5910 return r;
5911 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5912 if (fscid == FS_CLUSTER_ID_NONE) {
5913 return -ENOENT;
5914 }
5915
5916 std::ostringstream oss;
5917 oss << want << "." << fscid;
5918 want = oss.str();
5919 }
5920 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5921
5922 monclient->sub_want(want, 0, 0);
5923 monclient->renew_subs();
5924
5925 return 0;
5926 }
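
// With a named filesystem the subscription key becomes "mdsmap.<fscid>",
// e.g. "mdsmap.1" for fscid 1 (value illustrative); otherwise it stays the
// bare "mdsmap".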
5927
5928 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5929 bool require_mds, const std::string &fs_name)
5930 {
5931 std::lock_guard lock(client_lock);
5932
5933 if (mounted) {
5934 ldout(cct, 5) << "already mounted" << dendl;
5935 return 0;
5936 }
5937
5938 unmounting = false;
5939
5940 int r = subscribe_mdsmap(fs_name);
5941 if (r < 0) {
5942 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
5943 return r;
5944 }
5945
5946 tick(); // start tick
5947
5948 if (require_mds) {
5949 while (1) {
5950 auto availability = mdsmap->is_cluster_available();
5951 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5952 // Error out
5953 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5954 return CEPH_FUSE_NO_MDS_UP;
5955 } else if (availability == MDSMap::AVAILABLE) {
5956 // Continue to mount
5957 break;
5958 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5959 // Else, wait. MDSMonitor will update the map to bring
5960 // us to a conclusion eventually.
5961 wait_on_list(waiting_for_mdsmap);
5962 } else {
5963 // Unexpected value!
5964 ceph_abort();
5965 }
5966 }
5967 }
5968
5969 populate_metadata(mount_root.empty() ? "/" : mount_root);
5970
5971 filepath fp(CEPH_INO_ROOT);
5972 if (!mount_root.empty()) {
5973 fp = filepath(mount_root.c_str());
5974 }
5975 while (true) {
5976 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5977 req->set_filepath(fp);
5978 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5979 int res = make_request(req, perms);
5980 if (res < 0) {
5981 if (res == -EACCES && root) {
5982 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5983 break;
5984 }
5985 return res;
5986 }
5987
5988 if (fp.depth())
5989 fp.pop_dentry();
5990 else
5991 break;
5992 }
5993
5994 ceph_assert(root);
5995 _ll_get(root);
5996
5997 mounted = true;
5998
5999 // trace?
6000 if (!cct->_conf->client_trace.empty()) {
6001 traceout.open(cct->_conf->client_trace.c_str());
6002 if (traceout.is_open()) {
6003 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6004 } else {
6005 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6006 }
6007 }
6008
6009 /*
6010 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6011 ldout(cct, 3) << "op: struct stat st;" << dendl;
6012 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6013 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6014 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6015 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6016 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6017 ldout(cct, 3) << "op: int fd;" << dendl;
6018 */
6019 return 0;
6020 }
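
// A minimal mount sketch, assuming an initialized Client* `client` and a
// filesystem named "cephfs" (both assumptions):
//
//   UserPerm perms(getuid(), getgid());
//   int r = client->mount("/", perms, false /* require_mds */, "cephfs");
//   if (r < 0)
//     return r;  // e.g. CEPH_FUSE_NO_MDS_UP when require_mds is set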
6021
6022 // UNMOUNT
6023
6024 void Client::_close_sessions()
6025 {
6026 while (!mds_sessions.empty()) {
6027 // send session closes!
6028 for (auto &p : mds_sessions) {
6029 if (p.second.state != MetaSession::STATE_CLOSING) {
6030 _close_mds_session(&p.second);
6031 }
6032 }
6033
6034 // wait for sessions to close
6035 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
6036 std::unique_lock l{client_lock, std::adopt_lock};
6037 mount_cond.wait(l);
6038 l.release();
6039 }
6040 }
6041
6042 void Client::flush_mdlog_sync()
6043 {
6044 if (mds_requests.empty())
6045 return;
6046 for (auto &p : mds_sessions) {
6047 flush_mdlog(&p.second);
6048 }
6049 }
6050
6051 void Client::flush_mdlog(MetaSession *session)
6052 {
6053 // Only send this to Luminous or newer MDS daemons; older daemons
6054 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6055 const uint64_t features = session->con->get_features();
6056 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6057 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6058 session->con->send_message2(std::move(m));
6059 }
6060 }
6061
6062
6063 void Client::_abort_mds_sessions(int err)
6064 {
6065 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6066 auto req = p->second;
6067 ++p;
6068 // unsafe requests will be removed during close session below.
6069 if (req->got_unsafe)
6070 continue;
6071
6072 req->abort(err);
6073 if (req->caller_cond) {
6074 req->kick = true;
6075 req->caller_cond->notify_all();
6076 }
6077 }
6078
6079 // Process aborts on any requests that were on this waitlist.
6080 // Any requests that were on a waiting_for_open session waitlist
6081 // will get kicked during close session below.
6082 signal_cond_list(waiting_for_mdsmap);
6083
6084 // Force-close all sessions
6085 while(!mds_sessions.empty()) {
6086 auto& session = mds_sessions.begin()->second;
6087 _closed_mds_session(&session);
6088 }
6089 }
6090
6091 void Client::_unmount(bool abort)
6092 {
6093 std::unique_lock lock{client_lock, std::adopt_lock};
6094 if (unmounting)
6095 return;
6096
6097 if (abort || blacklisted) {
6098 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
6099 } else {
6100 ldout(cct, 2) << "unmounting" << dendl;
6101 }
6102 unmounting = true;
6103
6104 deleg_timeout = 0;
6105
6106 if (abort) {
6107 // Abort all mds sessions
6108 _abort_mds_sessions(-ENOTCONN);
6109
6110 objecter->op_cancel_writes(-ENOTCONN);
6111 } else {
6112 // flush the mdlog for pending requests, if any
6113 flush_mdlog_sync();
6114 }
6115
6116 mount_cond.wait(lock, [this] {
6117 if (!mds_requests.empty()) {
6118 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6119 << dendl;
6120 }
6121 return mds_requests.empty();
6122 });
6123 if (tick_event)
6124 timer.cancel_event(tick_event);
6125 tick_event = 0;
6126
6127 cwd.reset();
6128
6129 // clean up any unclosed files
6130 while (!fd_map.empty()) {
6131 Fh *fh = fd_map.begin()->second;
6132 fd_map.erase(fd_map.begin());
6133 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6134 _release_fh(fh);
6135 }
6136
6137 while (!ll_unclosed_fh_set.empty()) {
6138 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6139 Fh *fh = *it;
6140 ll_unclosed_fh_set.erase(fh);
6141 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6142 _release_fh(fh);
6143 }
6144
6145 while (!opened_dirs.empty()) {
6146 dir_result_t *dirp = *opened_dirs.begin();
6147 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6148 _closedir(dirp);
6149 }
6150
6151 _ll_drop_pins();
6152
6153 mount_cond.wait(lock, [this] {
6154 if (unsafe_sync_write > 0) {
6155 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
6156 << dendl;
6157 }
6158 return unsafe_sync_write <= 0;
6159 });
6160
6161 if (cct->_conf->client_oc) {
6162 // flush/release all buffered data
6163 std::list<InodeRef> anchor;
6164 for (auto& p : inode_map) {
6165 Inode *in = p.second;
6166 if (!in) {
6167 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6168 ceph_assert(in);
6169 }
6170
6171 // prevent inode from getting freed
6172 anchor.emplace_back(in);
6173
6174 if (abort || blacklisted) {
6175 objectcacher->purge_set(&in->oset);
6176 } else if (!in->caps.empty()) {
6177 _release(in);
6178 _flush(in, new C_Client_FlushComplete(this, in));
6179 }
6180 }
6181 }
6182
6183 if (abort || blacklisted) {
6184 for (auto p = dirty_list.begin(); !p.end(); ) {
6185 Inode *in = *p;
6186 ++p;
6187 if (in->dirty_caps) {
6188 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6189 in->mark_caps_clean();
6190 put_inode(in);
6191 }
6192 }
6193 } else {
6194 flush_caps_sync();
6195 wait_sync_caps(last_flush_tid);
6196 }
6197
6198 // empty lru cache
6199 trim_cache();
6200
6201 while (lru.lru_get_size() > 0 ||
6202 !inode_map.empty()) {
6203 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6204 << "+" << inode_map.size() << " items"
6205 << ", waiting (for caps to release?)"
6206 << dendl;
6207 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6208 r == std::cv_status::timeout) {
6209 dump_cache(NULL);
6210 }
6211 }
6212 ceph_assert(lru.lru_get_size() == 0);
6213 ceph_assert(inode_map.empty());
6214
6215 // stop tracing
6216 if (!cct->_conf->client_trace.empty()) {
6217 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6218 traceout.close();
6219 }
6220
6221 _close_sessions();
6222
6223 mounted = false;
6224
6225 lock.release();
6226 ldout(cct, 2) << "unmounted." << dendl;
6227 }
6228
6229 void Client::unmount()
6230 {
6231 std::lock_guard lock(client_lock);
6232 _unmount(false);
6233 }
6234
6235 void Client::abort_conn()
6236 {
6237 std::lock_guard lock(client_lock);
6238 _unmount(true);
6239 }
6240
6241 void Client::flush_cap_releases()
6242 {
6243 // send any cap releases
6244 for (auto &p : mds_sessions) {
6245 auto &session = p.second;
6246 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6247 p.first)) {
6248 if (cct->_conf->client_inject_release_failure) {
6249 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6250 } else {
6251 session.con->send_message2(std::move(session.release));
6252 }
6253 session.release.reset();
6254 }
6255 }
6256 }
6257
6258 void Client::tick()
6259 {
6260 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6261 sleep(cct->_conf->client_debug_inject_tick_delay);
6262 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6263 cct->_conf.apply_changes(nullptr);
6264 }
6265
6266 ldout(cct, 21) << "tick" << dendl;
6267 tick_event = timer.add_event_after(
6268 cct->_conf->client_tick_interval,
6269 new LambdaContext([this](int) {
6270 // Called back via Timer, which takes client_lock for us
6271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6272 tick();
6273 }));
6274 utime_t now = ceph_clock_now();
6275
6276 if (!mounted && !mds_requests.empty()) {
6277 MetaRequest *req = mds_requests.begin()->second;
6278 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6279 req->abort(-ETIMEDOUT);
6280 if (req->caller_cond) {
6281 req->kick = true;
6282 req->caller_cond->notify_all();
6283 }
6284 signal_cond_list(waiting_for_mdsmap);
6285 for (auto &p : mds_sessions) {
6286 signal_context_list(p.second.waiting_for_open);
6287 }
6288 }
6289 }
6290
6291 if (mdsmap->get_epoch()) {
6292 // renew caps?
6293 utime_t el = now - last_cap_renew;
6294 if (el > mdsmap->get_session_timeout() / 3.0)
6295 renew_caps();
6296
6297 flush_cap_releases();
6298 }
6299
6300 // delayed caps
6301 xlist<Inode*>::iterator p = delayed_list.begin();
6302 while (!p.end()) {
6303 Inode *in = *p;
6304 ++p;
6305 if (in->hold_caps_until > now)
6306 break;
6307 delayed_list.pop_front();
6308 check_caps(in, CHECK_CAPS_NODELAY);
6309 }
6310
6311 trim_cache(true);
6312 }
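
// A worked example of the renewal cadence above, assuming default config:
// with mds_session_timeout = 60s, renew_caps() runs once more than
// 60 / 3 = 20 seconds have elapsed since last_cap_renew, re-checked on
// every client_tick_interval (1s) tick.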
6313
6314 void Client::renew_caps()
6315 {
6316 ldout(cct, 10) << "renew_caps()" << dendl;
6317 last_cap_renew = ceph_clock_now();
6318
6319 for (auto &p : mds_sessions) {
6320 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6321 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6322 renew_caps(&p.second);
6323 }
6324 }
6325
6326 void Client::renew_caps(MetaSession *session)
6327 {
6328 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6329 session->last_cap_renew_request = ceph_clock_now();
6330 uint64_t seq = ++session->cap_renew_seq;
6331 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6332 }
6333
6334
6335 // ===============================================================
6336 // high level (POSIXy) interface
6337
6338 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6339 InodeRef *target, const UserPerm& perms)
6340 {
6341 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6342 MetaRequest *req = new MetaRequest(op);
6343 filepath path;
6344 dir->make_nosnap_relative_path(path);
6345 path.push_dentry(name);
6346 req->set_filepath(path);
6347 req->set_inode(dir);
6348 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6349 mask |= DEBUG_GETATTR_CAPS;
6350 req->head.args.getattr.mask = mask;
6351
6352 ldout(cct, 10) << __func__ << " on " << path << dendl;
6353
6354 int r = make_request(req, perms, target);
6355 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6356 return r;
6357 }
6358
6359 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6360 const UserPerm& perms)
6361 {
6362 int r = 0;
6363 Dentry *dn = NULL;
6364
6365 if (dname == "..") {
6366 if (dir->dentries.empty()) {
6367 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6368 filepath path(dir->ino);
6369 req->set_filepath(path);
6370
6371 InodeRef tmptarget;
6372 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6373
6374 if (r == 0) {
6375 Inode *tempino = tmptarget.get();
6376 _ll_get(tempino);
6377 *target = tempino;
6378 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6379 } else {
6380 *target = dir;
6381 }
6382 }
6383 else
6384 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6385 goto done;
6386 }
6387
6388 if (dname == ".") {
6389 *target = dir;
6390 goto done;
6391 }
6392
6393 if (!dir->is_dir()) {
6394 r = -ENOTDIR;
6395 goto done;
6396 }
6397
6398 if (dname.length() > NAME_MAX) {
6399 r = -ENAMETOOLONG;
6400 goto done;
6401 }
6402
6403 if (dname == cct->_conf->client_snapdir &&
6404 dir->snapid == CEPH_NOSNAP) {
6405 *target = open_snapdir(dir);
6406 goto done;
6407 }
6408
6409 if (dir->dir &&
6410 dir->dir->dentries.count(dname)) {
6411 dn = dir->dir->dentries[dname];
6412
6413 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6414 << " seq " << dn->lease_seq
6415 << dendl;
6416
6417 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6418 // is dn lease valid?
6419 utime_t now = ceph_clock_now();
6420 if (dn->lease_mds >= 0 &&
6421 dn->lease_ttl > now &&
6422 mds_sessions.count(dn->lease_mds)) {
6423 MetaSession &s = mds_sessions.at(dn->lease_mds);
6424 if (s.cap_ttl > now &&
6425 s.cap_gen == dn->lease_gen) {
6426 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6427 // make trim_caps() behave.
6428 dir->try_touch_cap(dn->lease_mds);
6429 goto hit_dn;
6430 }
6431 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6432 << " vs lease_gen " << dn->lease_gen << dendl;
6433 }
6434 // dir shared caps?
6435 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6436 if (dn->cap_shared_gen == dir->shared_gen &&
6437 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6438 goto hit_dn;
6439 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6440 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6441 << *dir << " dn '" << dname << "'" << dendl;
6442 return -ENOENT;
6443 }
6444 }
6445 } else {
6446 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6447 }
6448 } else {
6449 // can we conclude ENOENT locally?
6450 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6451 (dir->flags & I_COMPLETE)) {
6452 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6453 return -ENOENT;
6454 }
6455 }
6456
6457 r = _do_lookup(dir, dname, mask, target, perms);
6458 goto done;
6459
6460 hit_dn:
6461 if (dn->inode) {
6462 *target = dn->inode;
6463 } else {
6464 r = -ENOENT;
6465 }
6466 touch_dn(dn);
6467
6468 done:
6469 if (r < 0)
6470 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6471 else
6472 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6473 return r;
6474 }
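
// To summarize the fast path above: a cached dentry is trusted when either
// (a) its MDS lease is live -- lease_ttl in the future and the session's
// cap_gen matching the dentry's lease_gen -- or (b) the directory holds Fs
// (FILE_SHARED) caps and the dentry's cap_shared_gen matches; with Fs plus
// I_COMPLETE, a missing name is answered -ENOENT without an RPC.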
6475
6476 int Client::get_or_create(Inode *dir, const char* name,
6477 Dentry **pdn, bool expect_null)
6478 {
6479 // lookup
6480 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6481 dir->open_dir();
6482 if (dir->dir->dentries.count(name)) {
6483 Dentry *dn = dir->dir->dentries[name];
6484
6485 // is dn lease valid?
6486 utime_t now = ceph_clock_now();
6487 if (dn->inode &&
6488 dn->lease_mds >= 0 &&
6489 dn->lease_ttl > now &&
6490 mds_sessions.count(dn->lease_mds)) {
6491 MetaSession &s = mds_sessions.at(dn->lease_mds);
6492 if (s.cap_ttl > now &&
6493 s.cap_gen == dn->lease_gen) {
6494 if (expect_null)
6495 return -EEXIST;
6496 }
6497 }
6498 *pdn = dn;
6499 } else {
6500 // otherwise link up a new one
6501 *pdn = link(dir->dir, name, NULL, NULL);
6502 }
6503
6504 // success
6505 return 0;
6506 }
6507
6508 int Client::path_walk(const filepath& origpath, InodeRef *end,
6509 const UserPerm& perms, bool followsym, int mask)
6510 {
6511 filepath path = origpath;
6512 InodeRef cur;
6513 if (origpath.absolute())
6514 cur = root;
6515 else
6516 cur = cwd;
6517 ceph_assert(cur);
6518
6519 ldout(cct, 10) << __func__ << " " << path << dendl;
6520
6521 int symlinks = 0;
6522
6523 unsigned i=0;
6524 while (i < path.depth() && cur) {
6525 int caps = 0;
6526 const string &dname = path[i];
6527 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6528 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6529 InodeRef next;
6530 if (cct->_conf->client_permissions) {
6531 int r = may_lookup(cur.get(), perms);
6532 if (r < 0)
6533 return r;
6534 caps = CEPH_CAP_AUTH_SHARED;
6535 }
6536
6537 /* Get extra requested caps on the last component */
6538 if (i == (path.depth() - 1))
6539 caps |= mask;
6540 int r = _lookup(cur.get(), dname, caps, &next, perms);
6541 if (r < 0)
6542 return r;
6543 // only follow a trailing symlink if followsym; always follow
6544 // intermediate ('directory') symlinks.
6545 if (next && next->is_symlink()) {
6546 symlinks++;
6547 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6548 if (symlinks > MAXSYMLINKS) {
6549 return -ELOOP;
6550 }
6551
6552 if (i < path.depth() - 1) {
6553 // dir symlink
6554 // replace consumed components of path with symlink dir target
6555 filepath resolved(next->symlink.c_str());
6556 resolved.append(path.postfixpath(i + 1));
6557 path = resolved;
6558 i = 0;
6559 if (next->symlink[0] == '/') {
6560 cur = root;
6561 }
6562 continue;
6563 } else if (followsym) {
6564 if (next->symlink[0] == '/') {
6565 path = next->symlink.c_str();
6566 i = 0;
6567 // reset position
6568 cur = root;
6569 } else {
6570 filepath more(next->symlink.c_str());
6571 // remove the symlink component from the path before appending
6572 // the target that the symlink points to; we remain at the same
6573 // position in the path.
6574 path.pop_dentry();
6575 path.append(more);
6576 }
6577 continue;
6578 }
6579 }
6580 cur.swap(next);
6581 i++;
6582 }
6583 if (!cur)
6584 return -ENOENT;
6585 if (end)
6586 end->swap(cur);
6587 return 0;
6588 }
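
// An illustrative trace of the symlink handling above (names are made up):
// walking "a/b/c" where "b" is a symlink to "/x", "b" is not the last
// component, so the consumed prefix is replaced -- the path becomes "/x/c",
// i resets to 0, and cur resets to root because the target is absolute. A
// trailing symlink is only followed when followsym is true, and at most
// MAXSYMLINKS resolutions are allowed before -ELOOP.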
6589
6590
6591 // namespace ops
6592
6593 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6594 {
6595 std::lock_guard lock(client_lock);
6596 tout(cct) << "link" << std::endl;
6597 tout(cct) << relexisting << std::endl;
6598 tout(cct) << relpath << std::endl;
6599
6600 if (unmounting)
6601 return -ENOTCONN;
6602
6603 filepath existing(relexisting);
6604
6605 InodeRef in, dir;
6606 int r = path_walk(existing, &in, perm, true);
6607 if (r < 0)
6608 return r;
6609 if (std::string(relpath) == "/") {
6610 r = -EEXIST;
6611 return r;
6612 }
6613 filepath path(relpath);
6614 string name = path.last_dentry();
6615 path.pop_dentry();
6616
6617 r = path_walk(path, &dir, perm, true);
6618 if (r < 0)
6619 return r;
6620 if (cct->_conf->client_permissions) {
6621 if (S_ISDIR(in->mode)) {
6622 r = -EPERM;
6623 return r;
6624 }
6625 r = may_hardlink(in.get(), perm);
6626 if (r < 0)
6627 return r;
6628 r = may_create(dir.get(), perm);
6629 if (r < 0)
6630 return r;
6631 }
6632 r = _link(in.get(), dir.get(), name.c_str(), perm);
6633 return r;
6634 }
6635
6636 int Client::unlink(const char *relpath, const UserPerm& perm)
6637 {
6638 std::lock_guard lock(client_lock);
6639 tout(cct) << __func__ << std::endl;
6640 tout(cct) << relpath << std::endl;
6641
6642 if (unmounting)
6643 return -ENOTCONN;
6644
6645 if (std::string(relpath) == "/")
6646 return -EISDIR;
6647
6648 filepath path(relpath);
6649 string name = path.last_dentry();
6650 path.pop_dentry();
6651 InodeRef dir;
6652 int r = path_walk(path, &dir, perm);
6653 if (r < 0)
6654 return r;
6655 if (cct->_conf->client_permissions) {
6656 r = may_delete(dir.get(), name.c_str(), perm);
6657 if (r < 0)
6658 return r;
6659 }
6660 return _unlink(dir.get(), name.c_str(), perm);
6661 }
6662
6663 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6664 {
6665 std::lock_guard lock(client_lock);
6666 tout(cct) << __func__ << std::endl;
6667 tout(cct) << relfrom << std::endl;
6668 tout(cct) << relto << std::endl;
6669
6670 if (unmounting)
6671 return -ENOTCONN;
6672
6673 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6674 return -EBUSY;
6675
6676 filepath from(relfrom);
6677 filepath to(relto);
6678 string fromname = from.last_dentry();
6679 from.pop_dentry();
6680 string toname = to.last_dentry();
6681 to.pop_dentry();
6682
6683 InodeRef fromdir, todir;
6684 int r = path_walk(from, &fromdir, perm);
6685 if (r < 0)
6686 goto out;
6687 r = path_walk(to, &todir, perm);
6688 if (r < 0)
6689 goto out;
6690
6691 if (cct->_conf->client_permissions) {
6692 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6693 if (r < 0)
6694 return r;
6695 r = may_delete(todir.get(), toname.c_str(), perm);
6696 if (r < 0 && r != -ENOENT)
6697 return r;
6698 }
6699 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6700 out:
6701 return r;
6702 }
6703
6704 // dirs
6705
6706 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6707 {
6708 std::lock_guard lock(client_lock);
6709 tout(cct) << __func__ << std::endl;
6710 tout(cct) << relpath << std::endl;
6711 tout(cct) << mode << std::endl;
6712 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6713
6714 if (unmounting)
6715 return -ENOTCONN;
6716
6717 if (std::string(relpath) == "/")
6718 return -EEXIST;
6719
6720 filepath path(relpath);
6721 string name = path.last_dentry();
6722 path.pop_dentry();
6723 InodeRef dir;
6724 int r = path_walk(path, &dir, perm);
6725 if (r < 0)
6726 return r;
6727 if (cct->_conf->client_permissions) {
6728 r = may_create(dir.get(), perm);
6729 if (r < 0)
6730 return r;
6731 }
6732 return _mkdir(dir.get(), name.c_str(), mode, perm);
6733 }
6734
6735 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6736 {
6737 std::lock_guard lock(client_lock);
6738 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6739 tout(cct) << __func__ << std::endl;
6740 tout(cct) << relpath << std::endl;
6741 tout(cct) << mode << std::endl;
6742
6743 if (unmounting)
6744 return -ENOTCONN;
6745
6746 // get through the existing parts of the path
6747 filepath path(relpath);
6748 unsigned int i;
6749 int r = 0, caps = 0;
6750 InodeRef cur, next;
6751 cur = cwd;
6752 for (i=0; i<path.depth(); ++i) {
6753 if (cct->_conf->client_permissions) {
6754 r = may_lookup(cur.get(), perms);
6755 if (r < 0)
6756 break;
6757 caps = CEPH_CAP_AUTH_SHARED;
6758 }
6759 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6760 if (r < 0)
6761 break;
6762 cur.swap(next);
6763 }
6764 if (r != -ENOENT) return r;
6765 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6766 // make a new directory at each remaining level
6767 for (; i<path.depth(); ++i) {
6768 if (cct->_conf->client_permissions) {
6769 r = may_create(cur.get(), perms);
6770 if (r < 0)
6771 return r;
6772 }
6773 // make the new dir
6774 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6775
6776 // check proper creation/existence
6777 if(-EEXIST == r && i < path.depth() - 1) {
6778 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6779 }
6780 if (r < 0)
6781 return r;
6782 // move to the new dir and continue
6783 cur.swap(next);
6784 ldout(cct, 20) << __func__ << ": successfully created directory "
6785 << filepath(cur->ino).get_path() << dendl;
6786 }
6787 return 0;
6788 }
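
// mkdirs() behaves like `mkdir -p` relative to the cwd. A minimal sketch
// (path and mode are illustrative):
//
//   int r = client->mkdirs("a/b/c", 0755, perms);
//   // existing prefix components are traversed, missing ones are created;
//   // a racing -EEXIST on an intermediate component falls back to lookup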
6789
6790 int Client::rmdir(const char *relpath, const UserPerm& perms)
6791 {
6792 std::lock_guard lock(client_lock);
6793 tout(cct) << __func__ << std::endl;
6794 tout(cct) << relpath << std::endl;
6795
6796 if (unmounting)
6797 return -ENOTCONN;
6798
6799 if (std::string(relpath) == "/")
6800 return -EBUSY;
6801
6802 filepath path(relpath);
6803 string name = path.last_dentry();
6804 path.pop_dentry();
6805 InodeRef dir;
6806 int r = path_walk(path, &dir, perms);
6807 if (r < 0)
6808 return r;
6809 if (cct->_conf->client_permissions) {
6810 int r = may_delete(dir.get(), name.c_str(), perms);
6811 if (r < 0)
6812 return r;
6813 }
6814 return _rmdir(dir.get(), name.c_str(), perms);
6815 }
6816
6817 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6818 {
6819 std::lock_guard lock(client_lock);
6820 tout(cct) << __func__ << std::endl;
6821 tout(cct) << relpath << std::endl;
6822 tout(cct) << mode << std::endl;
6823 tout(cct) << rdev << std::endl;
6824
6825 if (unmounting)
6826 return -ENOTCONN;
6827
6828 if (std::string(relpath) == "/")
6829 return -EEXIST;
6830
6831 filepath path(relpath);
6832 string name = path.last_dentry();
6833 path.pop_dentry();
6834 InodeRef dir;
6835 int r = path_walk(path, &dir, perms);
6836 if (r < 0)
6837 return r;
6838 if (cct->_conf->client_permissions) {
6839 int r = may_create(dir.get(), perms);
6840 if (r < 0)
6841 return r;
6842 }
6843 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6844 }
6845
6846 // symlinks
6847
6848 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6849 {
6850 std::lock_guard lock(client_lock);
6851 tout(cct) << __func__ << std::endl;
6852 tout(cct) << target << std::endl;
6853 tout(cct) << relpath << std::endl;
6854
6855 if (unmounting)
6856 return -ENOTCONN;
6857
6858 if (std::string(relpath) == "/")
6859 return -EEXIST;
6860
6861 filepath path(relpath);
6862 string name = path.last_dentry();
6863 path.pop_dentry();
6864 InodeRef dir;
6865 int r = path_walk(path, &dir, perms);
6866 if (r < 0)
6867 return r;
6868 if (cct->_conf->client_permissions) {
6869 int r = may_create(dir.get(), perms);
6870 if (r < 0)
6871 return r;
6872 }
6873 return _symlink(dir.get(), name.c_str(), target, perms);
6874 }
6875
6876 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6877 {
6878 std::lock_guard lock(client_lock);
6879 tout(cct) << __func__ << std::endl;
6880 tout(cct) << relpath << std::endl;
6881
6882 if (unmounting)
6883 return -ENOTCONN;
6884
6885 filepath path(relpath);
6886 InodeRef in;
6887 int r = path_walk(path, &in, perms, false);
6888 if (r < 0)
6889 return r;
6890
6891 return _readlink(in.get(), buf, size);
6892 }
6893
6894 int Client::_readlink(Inode *in, char *buf, size_t size)
6895 {
6896 if (!in->is_symlink())
6897 return -EINVAL;
6898
6899 // copy into buf (at most size bytes)
6900 int r = in->symlink.length();
6901 if (r > (int)size)
6902 r = size;
6903 memcpy(buf, in->symlink.c_str(), r);
6904 return r;
6905 }
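
// Like readlink(2), this copies at most `size` bytes and does not
// NUL-terminate. A sketch of a caller that wants a C string (buffer name
// assumed):
//
//   char buf[PATH_MAX];
//   int n = client->readlink("link", buf, sizeof(buf) - 1, perms);
//   if (n >= 0)
//     buf[n] = '\0';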
6906
6907
6908 // inode stuff
6909
6910 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6911 {
6912 bool yes = in->caps_issued_mask(mask, true);
6913
6914 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6915 if (yes && !force)
6916 return 0;
6917
6918 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6919 filepath path;
6920 in->make_nosnap_relative_path(path);
6921 req->set_filepath(path);
6922 req->set_inode(in);
6923 req->head.args.getattr.mask = mask;
6924
6925 int res = make_request(req, perms);
6926 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6927 return res;
6928 }
6929
6930 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6931 const UserPerm& perms, InodeRef *inp)
6932 {
6933 int issued = in->caps_issued();
6934
6935 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6936 ccap_string(issued) << dendl;
6937
6938 if (in->snapid != CEPH_NOSNAP) {
6939 return -EROFS;
6940 }
6941 if ((mask & CEPH_SETATTR_SIZE) &&
6942 (unsigned long)stx->stx_size > in->size &&
6943 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6944 perms)) {
6945 return -EDQUOT;
6946 }
6947
6948 // make the change locally?
6949 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6950 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6951 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6952 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6953 << in->cap_dirtier_gid << ", forcing sync setattr"
6954 << dendl;
6955 /*
6956 * This works because we implicitly flush the caps as part of the
6957 * request, so the cap update check will happen with the writeback
6958 * cap context, and then the setattr check will happen with the
6959 * caller's context.
6960 *
6961 * In reality this pattern is likely pretty rare (different users
6962 * setattr'ing the same file). If that turns out not to be the
6963 * case later, we can build a more complex pipelined cap writeback
6964 * infrastructure...
6965 */
6966 if (!mask)
6967 mask |= CEPH_SETATTR_CTIME;
6968 goto force_request;
6969 }
6970
6971 if (!mask) {
6972 // caller just needs us to bump the ctime
6973 in->ctime = ceph_clock_now();
6974 in->cap_dirtier_uid = perms.uid();
6975 in->cap_dirtier_gid = perms.gid();
6976 if (issued & CEPH_CAP_AUTH_EXCL)
6977 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6978 else if (issued & CEPH_CAP_FILE_EXCL)
6979 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
6980 else if (issued & CEPH_CAP_XATTR_EXCL)
6981 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
6982 else
6983 mask |= CEPH_SETATTR_CTIME;
6984 }
6985
6986 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6987 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6988
6989 mask &= ~CEPH_SETATTR_KILL_SGUID;
6990
6991 if (mask & CEPH_SETATTR_UID) {
6992 in->ctime = ceph_clock_now();
6993 in->cap_dirtier_uid = perms.uid();
6994 in->cap_dirtier_gid = perms.gid();
6995 in->uid = stx->stx_uid;
6996 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
6997 mask &= ~CEPH_SETATTR_UID;
6998 kill_sguid = true;
6999 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7000 }
7001 if (mask & CEPH_SETATTR_GID) {
7002 in->ctime = ceph_clock_now();
7003 in->cap_dirtier_uid = perms.uid();
7004 in->cap_dirtier_gid = perms.gid();
7005 in->gid = stx->stx_gid;
7006 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7007 mask &= ~CEPH_SETATTR_GID;
7008 kill_sguid = true;
7009 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7010 }
7011
7012 if (mask & CEPH_SETATTR_MODE) {
7013 in->ctime = ceph_clock_now();
7014 in->cap_dirtier_uid = perms.uid();
7015 in->cap_dirtier_gid = perms.gid();
7016 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7017 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7018 mask &= ~CEPH_SETATTR_MODE;
7019 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7020 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7021 /* Must squash any setuid/setgid bits with an ownership change */
7022 in->mode &= ~(S_ISUID|S_ISGID);
7023 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7024 }
7025
7026 if (mask & CEPH_SETATTR_BTIME) {
7027 in->ctime = ceph_clock_now();
7028 in->cap_dirtier_uid = perms.uid();
7029 in->cap_dirtier_gid = perms.gid();
7030 in->btime = utime_t(stx->stx_btime);
7031 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7032 mask &= ~CEPH_SETATTR_BTIME;
7033 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7034 }
7035 } else if (mask & CEPH_SETATTR_SIZE) {
7036 /* If we don't have Ax, we must ask the server to clear the setuid/setgid bits on truncate */
7037 mask |= CEPH_SETATTR_KILL_SGUID;
7038 }
7039
7040 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7041 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7042 if (mask & CEPH_SETATTR_MTIME)
7043 in->mtime = utime_t(stx->stx_mtime);
7044 if (mask & CEPH_SETATTR_ATIME)
7045 in->atime = utime_t(stx->stx_atime);
7046 in->ctime = ceph_clock_now();
7047 in->cap_dirtier_uid = perms.uid();
7048 in->cap_dirtier_gid = perms.gid();
7049 in->time_warp_seq++;
7050 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7051 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7052 }
7053 }
7054 if (!mask) {
7055 in->change_attr++;
7056 return 0;
7057 }
7058
7059 force_request:
7060 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7061
7062 filepath path;
7063
7064 in->make_nosnap_relative_path(path);
7065 req->set_filepath(path);
7066 req->set_inode(in);
7067
7068 if (mask & CEPH_SETATTR_KILL_SGUID) {
7069 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7070 }
7071 if (mask & CEPH_SETATTR_MODE) {
7072 req->head.args.setattr.mode = stx->stx_mode;
7073 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7074 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7075 }
7076 if (mask & CEPH_SETATTR_UID) {
7077 req->head.args.setattr.uid = stx->stx_uid;
7078 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7079 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7080 }
7081 if (mask & CEPH_SETATTR_GID) {
7082 req->head.args.setattr.gid = stx->stx_gid;
7083 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7084 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7085 }
7086 if (mask & CEPH_SETATTR_BTIME) {
7087 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7088 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7089 }
7090 if (mask & CEPH_SETATTR_MTIME) {
7091 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7092 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7093 CEPH_CAP_FILE_WR;
7094 }
7095 if (mask & CEPH_SETATTR_ATIME) {
7096 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7097 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7098 CEPH_CAP_FILE_WR;
7099 }
7100 if (mask & CEPH_SETATTR_SIZE) {
7101 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7102 req->head.args.setattr.size = stx->stx_size;
7103 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7104 } else { // too big!
7105 put_request(req);
7106 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7107 return -EFBIG;
7108 }
7109 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7110 CEPH_CAP_FILE_WR;
7111 }
7112 req->head.args.setattr.mask = mask;
7113
7114 req->regetattr_mask = mask;
7115
7116 int res = make_request(req, perms, inp);
7117 ldout(cct, 10) << "_setattr result=" << res << dendl;
7118 return res;
7119 }
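
// Summary of the split above: with CEPH_CAP_AUTH_EXCL (or CEPH_CAP_FILE_EXCL
// for mtime/atime) the change is applied locally and the cap is marked
// dirty; whatever remains in `mask` is sent to the MDS as a synchronous
// CEPH_MDS_OP_SETATTR. Truncates always go to the server, and when Ax is
// absent they also ask it to clear setuid/setgid via CEPH_SETATTR_KILL_SGUID.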
7120
7121 /* Note that we only care about attrs that setattr cares about */
7122 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7123 {
7124 stx->stx_size = st->st_size;
7125 stx->stx_mode = st->st_mode;
7126 stx->stx_uid = st->st_uid;
7127 stx->stx_gid = st->st_gid;
7128 #ifdef __APPLE__
7129 stx->stx_mtime = st->st_mtimespec;
7130 stx->stx_atime = st->st_atimespec;
7131 #else
7132 stx->stx_mtime = st->st_mtim;
7133 stx->stx_atime = st->st_atim;
7134 #endif
7135 }
7136
7137 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7138 const UserPerm& perms, InodeRef *inp)
7139 {
7140 int ret = _do_setattr(in, stx, mask, perms, inp);
7141 if (ret < 0)
7142 return ret;
7143 if (mask & CEPH_SETATTR_MODE)
7144 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7145 return ret;
7146 }
7147
7148 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7149 const UserPerm& perms)
7150 {
7151 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7152 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7153 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7154 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7155 if (cct->_conf->client_permissions) {
7156 int r = may_setattr(in.get(), stx, mask, perms);
7157 if (r < 0)
7158 return r;
7159 }
7160 return __setattrx(in.get(), stx, mask, perms);
7161 }
7162
7163 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7164 const UserPerm& perms)
7165 {
7166 struct ceph_statx stx;
7167
7168 stat_to_statx(attr, &stx);
7169 mask &= ~CEPH_SETATTR_BTIME;
7170
7171 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7172 mask &= ~CEPH_SETATTR_UID;
7173 }
7174 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7175 mask &= ~CEPH_SETATTR_GID;
7176 }
7177
7178 return _setattrx(in, &stx, mask, perms);
7179 }
7180
7181 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7182 const UserPerm& perms)
7183 {
7184 std::lock_guard lock(client_lock);
7185 tout(cct) << __func__ << std::endl;
7186 tout(cct) << relpath << std::endl;
7187 tout(cct) << mask << std::endl;
7188
7189 if (unmounting)
7190 return -ENOTCONN;
7191
7192 filepath path(relpath);
7193 InodeRef in;
7194 int r = path_walk(path, &in, perms);
7195 if (r < 0)
7196 return r;
7197 return _setattr(in, attr, mask, perms);
7198 }
7199
7200 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7201 const UserPerm& perms, int flags)
7202 {
7203 std::lock_guard lock(client_lock);
7204 tout(cct) << __func__ << std::endl;
7205 tout(cct) << relpath << std::endl;
7206 tout(cct) << mask << std::endl;
7207
7208 if (unmounting)
7209 return -ENOTCONN;
7210
7211 filepath path(relpath);
7212 InodeRef in;
7213 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7214 if (r < 0)
7215 return r;
7216 return _setattrx(in, stx, mask, perms);
7217 }
7218
7219 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7220 {
7221 std::lock_guard lock(client_lock);
7222 tout(cct) << __func__ << std::endl;
7223 tout(cct) << fd << std::endl;
7224 tout(cct) << mask << std::endl;
7225
7226 if (unmounting)
7227 return -ENOTCONN;
7228
7229 Fh *f = get_filehandle(fd);
7230 if (!f)
7231 return -EBADF;
7232 #if defined(__linux__) && defined(O_PATH)
7233 if (f->flags & O_PATH)
7234 return -EBADF;
7235 #endif
7236 return _setattr(f->inode, attr, mask, perms);
7237 }
7238
7239 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7240 {
7241 std::lock_guard lock(client_lock);
7242 tout(cct) << __func__ << std::endl;
7243 tout(cct) << fd << std::endl;
7244 tout(cct) << mask << std::endl;
7245
7246 if (unmounting)
7247 return -ENOTCONN;
7248
7249 Fh *f = get_filehandle(fd);
7250 if (!f)
7251 return -EBADF;
7252 #if defined(__linux__) && defined(O_PATH)
7253 if (f->flags & O_PATH)
7254 return -EBADF;
7255 #endif
7256 return _setattrx(f->inode, stx, mask, perms);
7257 }
7258
7259 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7260 frag_info_t *dirstat, int mask)
7261 {
7262 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7263 std::lock_guard lock(client_lock);
7264 tout(cct) << "stat" << std::endl;
7265 tout(cct) << relpath << std::endl;
7266
7267 if (unmounting)
7268 return -ENOTCONN;
7269
7270 filepath path(relpath);
7271 InodeRef in;
7272 int r = path_walk(path, &in, perms, true, mask);
7273 if (r < 0)
7274 return r;
7275 r = _getattr(in, mask, perms);
7276 if (r < 0) {
7277 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7278 return r;
7279 }
7280 fill_stat(in, stbuf, dirstat);
7281 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7282 return r;
7283 }
7284
7285 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7286 {
7287 unsigned mask = 0;
7288
7289 /* if AT_NO_ATTR_SYNC is set, we don't need any caps -- just use what's in cache */
7290 if (flags & AT_NO_ATTR_SYNC)
7291 goto out;
7292
7293 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7294 mask |= CEPH_CAP_PIN;
7295 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7296 mask |= CEPH_CAP_AUTH_SHARED;
7297 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7298 mask |= CEPH_CAP_LINK_SHARED;
7299 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7300 mask |= CEPH_CAP_FILE_SHARED;
7301 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7302 mask |= CEPH_CAP_XATTR_SHARED;
7303 out:
7304 return mask;
7305 }
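
// A worked example of the want->caps mapping above:
// statx(..., want = CEPH_STATX_MTIME | CEPH_STATX_UID) yields
// CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_SHARED, while
// passing AT_NO_ATTR_SYNC in flags yields mask 0 (serve from cache).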
7306
7307 int Client::statx(const char *relpath, struct ceph_statx *stx,
7308 const UserPerm& perms,
7309 unsigned int want, unsigned int flags)
7310 {
7311 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7312 std::lock_guard lock(client_lock);
7313 tout(cct) << "statx" << std::endl;
7314 tout(cct) << relpath << std::endl;
7315
7316 if (unmounting)
7317 return -ENOTCONN;
7318
7319 filepath path(relpath);
7320 InodeRef in;
7321
7322 unsigned mask = statx_to_mask(flags, want);
7323
7324 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7325 if (r < 0)
7326 return r;
7327
7328 r = _getattr(in, mask, perms);
7329 if (r < 0) {
7330 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7331 return r;
7332 }
7333
7334 fill_statx(in, mask, stx);
7335 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7336 return r;
7337 }
7338
7339 int Client::lstat(const char *relpath, struct stat *stbuf,
7340 const UserPerm& perms, frag_info_t *dirstat, int mask)
7341 {
7342 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7343 std::lock_guard lock(client_lock);
7344 tout(cct) << __func__ << std::endl;
7345 tout(cct) << relpath << std::endl;
7346
7347 if (unmounting)
7348 return -ENOTCONN;
7349
7350 filepath path(relpath);
7351 InodeRef in;
7352 // don't follow symlinks
7353 int r = path_walk(path, &in, perms, false, mask);
7354 if (r < 0)
7355 return r;
7356 r = _getattr(in, mask, perms);
7357 if (r < 0) {
7358 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7359 return r;
7360 }
7361 fill_stat(in, stbuf, dirstat);
7362 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7363 return r;
7364 }
7365
7366 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7367 {
7368 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7369 << " mode 0" << oct << in->mode << dec
7370 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7371 memset(st, 0, sizeof(struct stat));
7372 if (use_faked_inos())
7373 st->st_ino = in->faked_ino;
7374 else
7375 st->st_ino = in->ino;
7376 st->st_dev = in->snapid;
7377 st->st_mode = in->mode;
7378 st->st_rdev = in->rdev;
7379 if (in->is_dir()) {
7380 switch (in->nlink) {
7381 case 0:
7382 st->st_nlink = 0; /* dir is unlinked */
7383 break;
7384 case 1:
7385 st->st_nlink = 1 /* parent dentry */
7386 + 1 /* <dir>/. */
7387 + in->dirstat.nsubdirs; /* each subdir's <subdir>/.. entry */
7388 break;
7389 default:
7390 ceph_abort();
7391 }
7392 } else {
7393 st->st_nlink = in->nlink;
7394 }
7395 st->st_uid = in->uid;
7396 st->st_gid = in->gid;
7397 if (in->ctime > in->mtime) {
7398 stat_set_ctime_sec(st, in->ctime.sec());
7399 stat_set_ctime_nsec(st, in->ctime.nsec());
7400 } else {
7401 stat_set_ctime_sec(st, in->mtime.sec());
7402 stat_set_ctime_nsec(st, in->mtime.nsec());
7403 }
7404 stat_set_atime_sec(st, in->atime.sec());
7405 stat_set_atime_nsec(st, in->atime.nsec());
7406 stat_set_mtime_sec(st, in->mtime.sec());
7407 stat_set_mtime_nsec(st, in->mtime.nsec());
7408 if (in->is_dir()) {
7409 if (cct->_conf->client_dirsize_rbytes)
7410 st->st_size = in->rstat.rbytes;
7411 else
7412 st->st_size = in->dirstat.size();
7413 st->st_blocks = 1;
7414 } else {
7415 st->st_size = in->size;
7416 st->st_blocks = (in->size + 511) >> 9;
7417 }
7418 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7419
7420 if (dirstat)
7421 *dirstat = in->dirstat;
7422 if (rstat)
7423 *rstat = in->rstat;
7424
7425 return in->caps_issued();
7426 }
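// Editorial note: st_blocks above is reported in 512-byte units, so a 1 KiB
// file yields (1024 + 511) >> 9 == 2 blocks, matching stat(2) conventions;
// directories are pinned to a single block regardless of their size.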
7427
7428 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7429 {
7430 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7431 << " mode 0" << oct << in->mode << dec
7432 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7433 memset(stx, 0, sizeof(struct ceph_statx));
7434
7435 /*
7436 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7437 * so that all bits are set.
7438 */
7439 if (!mask)
7440 mask = ~0;
7441
7442 /* These are always considered to be available */
7443 stx->stx_dev = in->snapid;
7444 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7445
7446 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7447 stx->stx_mode = S_IFMT & in->mode;
7448 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7449 stx->stx_rdev = in->rdev;
7450 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7451
7452 if (mask & CEPH_CAP_AUTH_SHARED) {
7453 stx->stx_uid = in->uid;
7454 stx->stx_gid = in->gid;
7455 stx->stx_mode = in->mode;
7456 in->btime.to_timespec(&stx->stx_btime);
7457 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7458 }
7459
7460 if (mask & CEPH_CAP_LINK_SHARED) {
7461 if (in->is_dir()) {
7462 switch (in->nlink) {
7463 case 0:
7464 stx->stx_nlink = 0; /* dir is unlinked */
7465 break;
7466 case 1:
7467 stx->stx_nlink = 1 /* parent dentry */
7468 + 1 /* <dir>/. */
7469 + in->dirstat.nsubdirs; /* each subdir adds a <subdir>/.. entry */
7470 break;
7471 default:
7472 ceph_abort();
7473 }
7474 } else {
7475 stx->stx_nlink = in->nlink;
7476 }
7477 stx->stx_mask |= CEPH_STATX_NLINK;
7478 }
7479
7480 if (mask & CEPH_CAP_FILE_SHARED) {
7481
7482 in->atime.to_timespec(&stx->stx_atime);
7483 in->mtime.to_timespec(&stx->stx_mtime);
7484
7485 if (in->is_dir()) {
7486 if (cct->_conf->client_dirsize_rbytes)
7487 stx->stx_size = in->rstat.rbytes;
7488 else
7489 stx->stx_size = in->dirstat.size();
7490 stx->stx_blocks = 1;
7491 } else {
7492 stx->stx_size = in->size;
7493 stx->stx_blocks = (in->size + 511) >> 9;
7494 }
7495 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7496 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7497 }
7498
7499 /* Change time and change_attr both require all shared caps to view */
7500 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7501 stx->stx_version = in->change_attr;
7502 if (in->ctime > in->mtime)
7503 in->ctime.to_timespec(&stx->stx_ctime);
7504 else
7505 in->mtime.to_timespec(&stx->stx_ctime);
7506 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7507 }
7508
7509 }
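// Editorial note: callers can tell which fields were actually synchronized
// by testing stx_mask on return -- e.g. CEPH_STATX_CTIME appears only when
// every shared cap was available, per the CEPH_STAT_CAP_INODE_ALL check
// above.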
7510
7511 void Client::touch_dn(Dentry *dn)
7512 {
7513 lru.lru_touch(dn);
7514 }
7515
7516 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7517 {
7518 std::lock_guard lock(client_lock);
7519 tout(cct) << __func__ << std::endl;
7520 tout(cct) << relpath << std::endl;
7521 tout(cct) << mode << std::endl;
7522
7523 if (unmounting)
7524 return -ENOTCONN;
7525
7526 filepath path(relpath);
7527 InodeRef in;
7528 int r = path_walk(path, &in, perms);
7529 if (r < 0)
7530 return r;
7531 struct stat attr;
7532 attr.st_mode = mode;
7533 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7534 }
7535
7536 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7537 {
7538 std::lock_guard lock(client_lock);
7539 tout(cct) << __func__ << std::endl;
7540 tout(cct) << fd << std::endl;
7541 tout(cct) << mode << std::endl;
7542
7543 if (unmounting)
7544 return -ENOTCONN;
7545
7546 Fh *f = get_filehandle(fd);
7547 if (!f)
7548 return -EBADF;
7549 #if defined(__linux__) && defined(O_PATH)
7550 if (f->flags & O_PATH)
7551 return -EBADF;
7552 #endif
7553 struct stat attr;
7554 attr.st_mode = mode;
7555 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7556 }
7557
7558 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7559 {
7560 std::lock_guard lock(client_lock);
7561 tout(cct) << __func__ << std::endl;
7562 tout(cct) << relpath << std::endl;
7563 tout(cct) << mode << std::endl;
7564
7565 if (unmounting)
7566 return -ENOTCONN;
7567
7568 filepath path(relpath);
7569 InodeRef in;
7570 // don't follow symlinks
7571 int r = path_walk(path, &in, perms, false);
7572 if (r < 0)
7573 return r;
7574 struct stat attr;
7575 attr.st_mode = mode;
7576 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7577 }
7578
7579 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7580 const UserPerm& perms)
7581 {
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7584 tout(cct) << relpath << std::endl;
7585 tout(cct) << new_uid << std::endl;
7586 tout(cct) << new_gid << std::endl;
7587
7588 if (unmounting)
7589 return -ENOTCONN;
7590
7591 filepath path(relpath);
7592 InodeRef in;
7593 int r = path_walk(path, &in, perms);
7594 if (r < 0)
7595 return r;
7596 struct stat attr;
7597 attr.st_uid = new_uid;
7598 attr.st_gid = new_gid;
7599 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7600 }
7601
7602 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7603 {
7604 std::lock_guard lock(client_lock);
7605 tout(cct) << __func__ << std::endl;
7606 tout(cct) << fd << std::endl;
7607 tout(cct) << new_uid << std::endl;
7608 tout(cct) << new_gid << std::endl;
7609
7610 if (unmounting)
7611 return -ENOTCONN;
7612
7613 Fh *f = get_filehandle(fd);
7614 if (!f)
7615 return -EBADF;
7616 #if defined(__linux__) && defined(O_PATH)
7617 if (f->flags & O_PATH)
7618 return -EBADF;
7619 #endif
7620 struct stat attr;
7621 attr.st_uid = new_uid;
7622 attr.st_gid = new_gid;
7623 int mask = 0;
7624 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7625 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7626 return _setattr(f->inode, &attr, mask, perms);
7627 }
7628
7629 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7630 const UserPerm& perms)
7631 {
7632 std::lock_guard lock(client_lock);
7633 tout(cct) << __func__ << std::endl;
7634 tout(cct) << relpath << std::endl;
7635 tout(cct) << new_uid << std::endl;
7636 tout(cct) << new_gid << std::endl;
7637
7638 if (unmounting)
7639 return -ENOTCONN;
7640
7641 filepath path(relpath);
7642 InodeRef in;
7643 // don't follow symlinks
7644 int r = path_walk(path, &in, perms, false);
7645 if (r < 0)
7646 return r;
7647 struct stat attr;
7648 attr.st_uid = new_uid;
7649 attr.st_gid = new_gid;
7650 int mask = 0;
7651 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7652 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7653 return _setattr(in, &attr, mask, perms);
7654 }
7655
7656 static void attr_set_atime_and_mtime(struct stat *attr,
7657 const utime_t &atime,
7658 const utime_t &mtime)
7659 {
7660 stat_set_atime_sec(attr, atime.tv.tv_sec);
7661 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
7662 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
7663 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
7664 }
7665
7666 // for [l]utime(), invoke the timeval variant, as the timespec
7667 // variants are not yet implemented. for futime[s](), invoke
7668 // the timespec variant.
7669 int Client::utime(const char *relpath, struct utimbuf *buf,
7670 const UserPerm& perms)
7671 {
7672 struct timeval tv[2];
7673 tv[0].tv_sec = buf->actime;
7674 tv[0].tv_usec = 0;
7675 tv[1].tv_sec = buf->modtime;
7676 tv[1].tv_usec = 0;
7677
7678 return utimes(relpath, tv, perms);
7679 }
7680
7681 int Client::lutime(const char *relpath, struct utimbuf *buf,
7682 const UserPerm& perms)
7683 {
7684 struct timeval tv[2];
7685 tv[0].tv_sec = buf->actime;
7686 tv[0].tv_usec = 0;
7687 tv[1].tv_sec = buf->modtime;
7688 tv[1].tv_usec = 0;
7689
7690 return lutimes(relpath, tv, perms);
7691 }
7692
7693 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7694 {
7695 struct timespec ts[2];
7696 ts[0].tv_sec = buf->actime;
7697 ts[0].tv_nsec = 0;
7698 ts[1].tv_sec = buf->modtime;
7699 ts[1].tv_nsec = 0;
7700
7701 return futimens(fd, ts, perms);
7702 }
7703
7704 int Client::utimes(const char *relpath, struct timeval times[2],
7705 const UserPerm& perms)
7706 {
7707 std::lock_guard lock(client_lock);
7708 tout(cct) << __func__ << std::endl;
7709 tout(cct) << relpath << std::endl;
7710 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7711 << std::endl;
7712 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7713 << std::endl;
7714
7715 if (unmounting)
7716 return -ENOTCONN;
7717
7718 filepath path(relpath);
7719 InodeRef in;
7720 int r = path_walk(path, &in, perms);
7721 if (r < 0)
7722 return r;
7723 struct stat attr;
7724 utime_t atime(times[0]);
7725 utime_t mtime(times[1]);
7726
7727 attr_set_atime_and_mtime(&attr, atime, mtime);
7728 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7729 }
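// Illustrative caller sketch (editorial; the helper and path are
// hypothetical, the entry point is the one defined above): set both
// timestamps to the epoch via the timeval-based variant.
#if 0
static int touch_epoch_example(Client *client, const UserPerm& perms)
{
  struct timeval tv[2] = {{0, 0}, {0, 0}};      // [0]=atime, [1]=mtime
  return client->utimes("/some/file", tv, perms);
}
#endif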
7730
7731 int Client::lutimes(const char *relpath, struct timeval times[2],
7732 const UserPerm& perms)
7733 {
7734 std::lock_guard lock(client_lock);
7735 tout(cct) << __func__ << std::endl;
7736 tout(cct) << relpath << std::endl;
7737 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7738 << std::endl;
7739 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7740 << std::endl;
7741
7742 if (unmounting)
7743 return -ENOTCONN;
7744
7745 filepath path(relpath);
7746 InodeRef in;
7747 int r = path_walk(path, &in, perms, false);
7748 if (r < 0)
7749 return r;
7750 struct stat attr;
7751 utime_t atime(times[0]);
7752 utime_t mtime(times[1]);
7753
7754 attr_set_atime_and_mtime(&attr, atime, mtime);
7755 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7756 }
7757
7758 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7759 {
7760 struct timespec ts[2];
7761 ts[0].tv_sec = times[0].tv_sec;
7762 ts[0].tv_nsec = times[0].tv_usec * 1000;
7763 ts[1].tv_sec = times[1].tv_sec;
7764 ts[1].tv_nsec = times[1].tv_usec * 1000;
7765
7766 return futimens(fd, ts, perms);
7767 }
7768
7769 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7770 {
7771 std::lock_guard lock(client_lock);
7772 tout(cct) << __func__ << std::endl;
7773 tout(cct) << fd << std::endl;
7774 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7775 << std::endl;
7776 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7777 << std::endl;
7778
7779 if (unmounting)
7780 return -ENOTCONN;
7781
7782 Fh *f = get_filehandle(fd);
7783 if (!f)
7784 return -EBADF;
7785 #if defined(__linux__) && defined(O_PATH)
7786 if (f->flags & O_PATH)
7787 return -EBADF;
7788 #endif
7789 struct stat attr;
7790 utime_t atime(times[0]);
7791 utime_t mtime(times[1]);
7792
7793 attr_set_atime_and_mtime(&attr, atime, mtime);
7794 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7795 }
7796
7797 int Client::flock(int fd, int operation, uint64_t owner)
7798 {
7799 std::lock_guard lock(client_lock);
7800 tout(cct) << __func__ << std::endl;
7801 tout(cct) << fd << std::endl;
7802 tout(cct) << operation << std::endl;
7803 tout(cct) << owner << std::endl;
7804
7805 if (unmounting)
7806 return -ENOTCONN;
7807
7808 Fh *f = get_filehandle(fd);
7809 if (!f)
7810 return -EBADF;
7811
7812 return _flock(f, operation, owner);
7813 }
7814
7815 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7816 {
7817 std::lock_guard lock(client_lock);
7818 tout(cct) << __func__ << std::endl;
7819 tout(cct) << relpath << std::endl;
7820
7821 if (unmounting)
7822 return -ENOTCONN;
7823
7824 filepath path(relpath);
7825 InodeRef in;
7826 int r = path_walk(path, &in, perms, true);
7827 if (r < 0)
7828 return r;
7829 if (cct->_conf->client_permissions) {
7830 int r = may_open(in.get(), O_RDONLY, perms);
7831 if (r < 0)
7832 return r;
7833 }
7834 r = _opendir(in.get(), dirpp, perms);
7835 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
7836 if (r != -ENOTDIR)
7837 tout(cct) << (unsigned long)*dirpp << std::endl;
7838 return r;
7839 }
7840
7841 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7842 {
7843 if (!in->is_dir())
7844 return -ENOTDIR;
7845 *dirpp = new dir_result_t(in, perms);
7846 opened_dirs.insert(*dirpp);
7847 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7848 return 0;
7849 }
7850
7851
7852 int Client::closedir(dir_result_t *dir)
7853 {
7854 std::lock_guard lock(client_lock);
7855 tout(cct) << __func__ << std::endl;
7856 tout(cct) << (unsigned long)dir << std::endl;
7857
7858 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7859 _closedir(dir);
7860 return 0;
7861 }
7862
7863 void Client::_closedir(dir_result_t *dirp)
7864 {
7865 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
7866 if (dirp->inode) {
7867 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
7868 dirp->inode.reset();
7869 }
7870 _readdir_drop_dirp_buffer(dirp);
7871 opened_dirs.erase(dirp);
7872 delete dirp;
7873 }
7874
7875 void Client::rewinddir(dir_result_t *dirp)
7876 {
7877 std::lock_guard lock(client_lock);
7878 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
7879
7880 if (unmounting)
7881 return;
7882
7883 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7884 _readdir_drop_dirp_buffer(d);
7885 d->reset();
7886 }
7887
7888 loff_t Client::telldir(dir_result_t *dirp)
7889 {
7890 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7891 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7892 return d->offset;
7893 }
7894
7895 void Client::seekdir(dir_result_t *dirp, loff_t offset)
7896 {
7897 std::lock_guard lock(client_lock);
7898
7899 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
7900
7901 if (unmounting)
7902 return;
7903
7904 if (offset == dirp->offset)
7905 return;
7906
7907 if (offset > dirp->offset)
7908 dirp->release_count = 0; // bump if we do a forward seek
7909 else
7910 dirp->ordered_count = 0; // disable filling readdir cache
7911
7912 if (dirp->hash_order()) {
7913 if (dirp->offset > offset) {
7914 _readdir_drop_dirp_buffer(dirp);
7915 dirp->reset();
7916 }
7917 } else {
7918 if (offset == 0 ||
7919 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7920 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7921 _readdir_drop_dirp_buffer(dirp);
7922 dirp->reset();
7923 }
7924 }
7925
7926 dirp->offset = offset;
7927 }
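// Editorial note on the fpos encoding used above: a directory offset packs
// the fragment (or hash) into the high bits and a per-frag position into
// the low bits; dir_result_t::fpos_high()/fpos_low() undo what make_fpos()
// builds. Positions 0 and 1 are reserved for "." and "..", which is why
// fresh frags start at position 2 elsewhere in this file.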
7928
7929
7930 //struct dirent {
7931 // ino_t d_ino; /* inode number */
7932 // off_t d_off; /* offset to the next dirent */
7933 // unsigned short d_reclen; /* length of this record */
7934 // unsigned char d_type; /* type of file */
7935 // char d_name[256]; /* filename */
7936 //};
7937 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7938 {
7939 strncpy(de->d_name, name, 255);
7940 de->d_name[255] = '\0';
7941 #ifndef __CYGWIN__
7942 de->d_ino = ino;
7943 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7944 de->d_off = next_off;
7945 #endif
7946 de->d_reclen = 1;
7947 de->d_type = IFTODT(type);
7948 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7949 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7950 #endif
7951 }
7952
7953 void Client::_readdir_next_frag(dir_result_t *dirp)
7954 {
7955 frag_t fg = dirp->buffer_frag;
7956
7957 if (fg.is_rightmost()) {
7958 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
7959 dirp->set_end();
7960 return;
7961 }
7962
7963 // advance
7964 fg = fg.next();
7965 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
7966
7967 if (dirp->hash_order()) {
7968 // keep last_name
7969 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7970 if (dirp->offset < new_offset) // don't decrease offset
7971 dirp->offset = new_offset;
7972 } else {
7973 dirp->last_name.clear();
7974 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7975 _readdir_rechoose_frag(dirp);
7976 }
7977 }
7978
7979 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7980 {
7981 ceph_assert(dirp->inode);
7982
7983 if (dirp->hash_order())
7984 return;
7985
7986 frag_t cur = frag_t(dirp->offset_high());
7987 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7988 if (fg != cur) {
7989 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
7990 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7991 dirp->last_name.clear();
7992 dirp->next_offset = 2;
7993 }
7994 }
7995
7996 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7997 {
7998 ldout(cct, 10) << __func__ << " " << dirp << dendl;
7999 dirp->buffer.clear();
8000 }
8001
8002 int Client::_readdir_get_frag(dir_result_t *dirp)
8003 {
8004 ceph_assert(dirp);
8005 ceph_assert(dirp->inode);
8006
8007 // get the current frag.
8008 frag_t fg;
8009 if (dirp->hash_order())
8010 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8011 else
8012 fg = frag_t(dirp->offset_high());
8013
8014 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8015 << " offset " << hex << dirp->offset << dec << dendl;
8016
8017 int op = CEPH_MDS_OP_READDIR;
8018 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8019 op = CEPH_MDS_OP_LSSNAP;
8020
8021 InodeRef& diri = dirp->inode;
8022
8023 MetaRequest *req = new MetaRequest(op);
8024 filepath path;
8025 diri->make_nosnap_relative_path(path);
8026 req->set_filepath(path);
8027 req->set_inode(diri.get());
8028 req->head.args.readdir.frag = fg;
8029 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8030 if (dirp->last_name.length()) {
8031 req->path2.set_path(dirp->last_name);
8032 } else if (dirp->hash_order()) {
8033 req->head.args.readdir.offset_hash = dirp->offset_high();
8034 }
8035 req->dirp = dirp;
8036
8037 bufferlist dirbl;
8038 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8039
8040 if (res == -EAGAIN) {
8041 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8042 _readdir_rechoose_frag(dirp);
8043 return _readdir_get_frag(dirp);
8044 }
8045
8046 if (res == 0) {
8047 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8048 << " size " << dirp->buffer.size() << dendl;
8049 } else {
8050 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8051 dirp->set_end();
8052 }
8053
8054 return res;
8055 }
8056
8057 struct dentry_off_lt {
8058 bool operator()(const Dentry* dn, int64_t off) const {
8059 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8060 }
8061 };
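// Editorial note: this comparator lets std::lower_bound binary-search the
// per-directory readdir_cache (kept ordered by dentry offset) below, using
// dir_result_t::fpos_cmp() so offsets compare in fpos order rather than as
// raw integers.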
8062
8063 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8064 int caps, bool getref)
8065 {
8066 ceph_assert(ceph_mutex_is_locked(client_lock));
8067 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8068 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8069 << dendl;
8070 Dir *dir = dirp->inode->dir;
8071
8072 if (!dir) {
8073 ldout(cct, 10) << " dir is empty" << dendl;
8074 dirp->set_end();
8075 return 0;
8076 }
8077
8078 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8079 dir->readdir_cache.end(),
8080 dirp->offset, dentry_off_lt());
8081
8082 string dn_name;
8083 while (true) {
8084 if (!dirp->inode->is_complete_and_ordered())
8085 return -EAGAIN;
8086 if (pd == dir->readdir_cache.end())
8087 break;
8088 Dentry *dn = *pd;
8089 if (dn->inode == NULL) {
8090 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8091 ++pd;
8092 continue;
8093 }
8094 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8095 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8096 ++pd;
8097 continue;
8098 }
8099
8100 int idx = pd - dir->readdir_cache.begin();
8101 int r = _getattr(dn->inode, caps, dirp->perms);
8102 if (r < 0)
8103 return r;
8104
8105 // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8106 pd = dir->readdir_cache.begin() + idx;
8107 if (pd >= dir->readdir_cache.end() || *pd != dn)
8108 return -EAGAIN;
8109
8110 struct ceph_statx stx;
8111 struct dirent de;
8112 fill_statx(dn->inode, caps, &stx);
8113
8114 uint64_t next_off = dn->offset + 1;
8115 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8116 ++pd;
8117 if (pd == dir->readdir_cache.end())
8118 next_off = dir_result_t::END;
8119
8120 Inode *in = NULL;
8121 if (getref) {
8122 in = dn->inode.get();
8123 _ll_get(in);
8124 }
8125
8126 dn_name = dn->name; // fill in name while we have lock
8127
8128 client_lock.unlock();
8129 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8130 client_lock.lock();
8131 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8132 << " = " << r << dendl;
8133 if (r < 0) {
8134 return r;
8135 }
8136
8137 dirp->offset = next_off;
8138 if (dirp->at_end())
8139 dirp->next_offset = 2;
8140 else
8141 dirp->next_offset = dirp->offset_low();
8142 dirp->last_name = dn_name; // we successfully returned this one; update!
8143 dirp->release_count = 0; // last_name no longer match cache index
8144 if (r > 0)
8145 return r;
8146 }
8147
8148 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8149 dirp->set_end();
8150 return 0;
8151 }
8152
8153 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8154 unsigned want, unsigned flags, bool getref)
8155 {
8156 int caps = statx_to_mask(flags, want);
8157
8158 std::lock_guard lock(client_lock);
8159
8160 if (unmounting)
8161 return -ENOTCONN;
8162
8163 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8164
8165 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8166 << dec << " at_end=" << dirp->at_end()
8167 << " hash_order=" << dirp->hash_order() << dendl;
8168
8169 struct dirent de;
8170 struct ceph_statx stx;
8171 memset(&de, 0, sizeof(de));
8172 memset(&stx, 0, sizeof(stx));
8173
8174 InodeRef& diri = dirp->inode;
8175
8176 if (dirp->at_end())
8177 return 0;
8178
8179 if (dirp->offset == 0) {
8180 ldout(cct, 15) << " including ." << dendl;
8181 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8182 uint64_t next_off = 1;
8183
8184 int r;
8185 r = _getattr(diri, caps, dirp->perms);
8186 if (r < 0)
8187 return r;
8188
8189 fill_statx(diri, caps, &stx);
8190 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8191
8192 Inode *inode = NULL;
8193 if (getref) {
8194 inode = diri.get();
8195 _ll_get(inode);
8196 }
8197
8198 client_lock.unlock();
8199 r = cb(p, &de, &stx, next_off, inode);
8200 client_lock.lock();
8201 if (r < 0)
8202 return r;
8203
8204 dirp->offset = next_off;
8205 if (r > 0)
8206 return r;
8207 }
8208 if (dirp->offset == 1) {
8209 ldout(cct, 15) << " including .." << dendl;
8210 uint64_t next_off = 2;
8211 InodeRef in;
8212 if (diri->dentries.empty())
8213 in = diri;
8214 else
8215 in = diri->get_first_parent()->dir->parent_inode;
8216
8217 int r;
8218 r = _getattr(in, caps, dirp->perms);
8219 if (r < 0)
8220 return r;
8221
8222 fill_statx(in, caps, &stx);
8223 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8224
8225 Inode *inode = NULL;
8226 if (getref) {
8227 inode = in.get();
8228 _ll_get(inode);
8229 }
8230
8231 client_lock.unlock();
8232 r = cb(p, &de, &stx, next_off, inode);
8233 client_lock.lock();
8234 if (r < 0)
8235 return r;
8236
8237 dirp->offset = next_off;
8238 if (r > 0)
8239 return r;
8240 }
8241
8242 // can we read from our cache?
8243 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8244 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8245 << dirp->inode->is_complete_and_ordered()
8246 << " issued " << ccap_string(dirp->inode->caps_issued())
8247 << dendl;
8248 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8249 dirp->inode->is_complete_and_ordered() &&
8250 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8251 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8252 if (err != -EAGAIN)
8253 return err;
8254 }
8255
8256 while (1) {
8257 if (dirp->at_end())
8258 return 0;
8259
8260 bool check_caps = true;
8261 if (!dirp->is_cached()) {
8262 int r = _readdir_get_frag(dirp);
8263 if (r)
8264 return r;
8265 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8266 // different from the requested one (our dirfragtree was outdated).
8267 check_caps = false;
8268 }
8269 frag_t fg = dirp->buffer_frag;
8270
8271 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8272 << " offset " << hex << dirp->offset << dendl;
8273
8274 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8275 dirp->offset, dir_result_t::dentry_off_lt());
8276 it != dirp->buffer.end();
8277 ++it) {
8278 dir_result_t::dentry &entry = *it;
8279
8280 uint64_t next_off = entry.offset + 1;
8281
8282 int r;
8283 if (check_caps) {
8284 r = _getattr(entry.inode, caps, dirp->perms);
8285 if (r < 0)
8286 return r;
8287 }
8288
8289 fill_statx(entry.inode, caps, &stx);
8290 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8291
8292 Inode *inode = NULL;
8293 if (getref) {
8294 inode = entry.inode.get();
8295 _ll_get(inode);
8296 }
8297
8298 client_lock.unlock();
8299 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8300 client_lock.lock();
8301
8302 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8303 << " = " << r << dendl;
8304 if (r < 0)
8305 return r;
8306
8307 dirp->offset = next_off;
8308 if (r > 0)
8309 return r;
8310 }
8311
8312 if (dirp->next_offset > 2) {
8313 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8314 _readdir_drop_dirp_buffer(dirp);
8315 continue; // more!
8316 }
8317
8318 if (!fg.is_rightmost()) {
8319 // next frag!
8320 _readdir_next_frag(dirp);
8321 continue;
8322 }
8323
8324 if (diri->shared_gen == dirp->start_shared_gen &&
8325 diri->dir_release_count == dirp->release_count) {
8326 if (diri->dir_ordered_count == dirp->ordered_count) {
8327 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8328 if (diri->dir) {
8329 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8330 diri->dir->readdir_cache.resize(dirp->cache_index);
8331 }
8332 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8333 } else {
8334 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8335 diri->flags |= I_COMPLETE;
8336 }
8337 }
8338
8339 dirp->set_end();
8340 return 0;
8341 }
8342 ceph_abort();
8343 return 0;
8344 }
8345
8346
8347 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8348 {
8349 return readdirplus_r(d, de, 0, 0, 0, NULL);
8350 }
8351
8352 /*
8353 * readdirplus_r
8354 *
8355 * returns
8356 * 1 if we got a dirent
8357 * 0 for end of directory
8358 * <0 on error
8359 */
8360
8361 struct single_readdir {
8362 struct dirent *de;
8363 struct ceph_statx *stx;
8364 Inode *inode;
8365 bool full;
8366 };
8367
8368 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8369 struct ceph_statx *stx, off_t off,
8370 Inode *in)
8371 {
8372 single_readdir *c = static_cast<single_readdir *>(p);
8373
8374 if (c->full)
8375 return -1; // already filled this dirent
8376
8377 *c->de = *de;
8378 if (c->stx)
8379 *c->stx = *stx;
8380 c->inode = in;
8381 c->full = true;
8382 return 1;
8383 }
8384
8385 struct dirent *Client::readdir(dir_result_t *d)
8386 {
8387 int ret;
8388 static struct dirent de;
8389 single_readdir sr;
8390 sr.de = &de;
8391 sr.stx = NULL;
8392 sr.inode = NULL;
8393 sr.full = false;
8394
8395 // our callback fills the dirent and sets sr.full=true on first
8396 // call, and returns -1 the second time around.
8397 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8398 if (ret < -1) {
8399 errno = -ret; // this sucks.
8400 return (dirent *) NULL;
8401 }
8402 if (sr.full) {
8403 return &de;
8404 }
8405 return (dirent *) NULL;
8406 }
8407
8408 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8409 struct ceph_statx *stx, unsigned want,
8410 unsigned flags, Inode **out)
8411 {
8412 single_readdir sr;
8413 sr.de = de;
8414 sr.stx = stx;
8415 sr.inode = NULL;
8416 sr.full = false;
8417
8418 // our callback fills the dirent and sets sr.full=true on first
8419 // call, and returns -1 the second time around.
8420 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8421 if (r < -1)
8422 return r;
8423 if (out)
8424 *out = sr.inode;
8425 if (sr.full)
8426 return 1;
8427 return 0;
8428 }
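// Hedged usage sketch (editorial): draining a directory with the plus
// variant above. The entry points are the ones defined in this file; the
// path is hypothetical.
#if 0
static int list_dir_example(Client *client, const UserPerm& perms)
{
  dir_result_t *dirp;
  int r = client->opendir("/some/dir", &dirp, perms);
  if (r < 0)
    return r;
  struct dirent de;
  struct ceph_statx stx;
  while ((r = client->readdirplus_r(dirp, &de, &stx,
                                    CEPH_STATX_INO, 0, NULL)) == 1) {
    // de.d_name is valid; stx fields are gated by stx.stx_mask
  }
  client->closedir(dirp);
  return r;     // 0 at end of directory, <0 on error
}
#endif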
8429
8430
8431 /* getdents */
8432 struct getdents_result {
8433 char *buf;
8434 int buflen;
8435 int pos;
8436 bool fullent;
8437 };
8438
8439 static int _readdir_getdent_cb(void *p, struct dirent *de,
8440 struct ceph_statx *stx, off_t off, Inode *in)
8441 {
8442 struct getdents_result *c = static_cast<getdents_result *>(p);
8443
8444 int dlen;
8445 if (c->fullent)
8446 dlen = sizeof(*de);
8447 else
8448 dlen = strlen(de->d_name) + 1;
8449
8450 if (c->pos + dlen > c->buflen)
8451 return -1; // doesn't fit
8452
8453 if (c->fullent) {
8454 memcpy(c->buf + c->pos, de, sizeof(*de));
8455 } else {
8456 memcpy(c->buf + c->pos, de->d_name, dlen);
8457 }
8458 c->pos += dlen;
8459 return 0;
8460 }
8461
8462 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8463 {
8464 getdents_result gr;
8465 gr.buf = buf;
8466 gr.buflen = buflen;
8467 gr.fullent = fullent;
8468 gr.pos = 0;
8469
8470 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8471
8472 if (r < 0) { // some error
8473 if (r == -1) { // buffer ran out of space
8474 if (gr.pos) { // but we got some entries already!
8475 return gr.pos;
8476 } // or we need a larger buffer
8477 return -ERANGE;
8478 } else { // actual error, return it
8479 return r;
8480 }
8481 }
8482 return gr.pos;
8483 }
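// Editorial note on the error mapping above: -ERANGE means the very first
// entry already exceeded the buffer; once at least one entry fits, a full
// buffer simply ends the batch and the bytes consumed so far are returned.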
8484
8485
8486 /* getdir */
8487 struct getdir_result {
8488 list<string> *contents;
8489 int num;
8490 };
8491
8492 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8493 {
8494 getdir_result *r = static_cast<getdir_result *>(p);
8495
8496 r->contents->push_back(de->d_name);
8497 r->num++;
8498 return 0;
8499 }
8500
8501 int Client::getdir(const char *relpath, list<string>& contents,
8502 const UserPerm& perms)
8503 {
8504 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8505 {
8506 std::lock_guard lock(client_lock);
8507 tout(cct) << "getdir" << std::endl;
8508 tout(cct) << relpath << std::endl;
8509 }
8510
8511 dir_result_t *d;
8512 int r = opendir(relpath, &d, perms);
8513 if (r < 0)
8514 return r;
8515
8516 getdir_result gr;
8517 gr.contents = &contents;
8518 gr.num = 0;
8519 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8520
8521 closedir(d);
8522
8523 if (r < 0)
8524 return r;
8525 return gr.num;
8526 }
8527
8528
8529 /****** file i/o **********/
8530 int Client::open(const char *relpath, int flags, const UserPerm& perms,
8531 mode_t mode, int stripe_unit, int stripe_count,
8532 int object_size, const char *data_pool)
8533 {
8534 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
8535 std::lock_guard lock(client_lock);
8536 tout(cct) << "open" << std::endl;
8537 tout(cct) << relpath << std::endl;
8538 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8539
8540 if (unmounting)
8541 return -ENOTCONN;
8542
8543 Fh *fh = NULL;
8544
8545 #if defined(__linux__) && defined(O_PATH)
8546 /* When O_PATH is specified, flags other than O_DIRECTORY
8547 * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
8548 * function in the kernel (fs/open.c). */
8549 if (flags & O_PATH)
8550 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8551 #endif
8552
8553 filepath path(relpath);
8554 InodeRef in;
8555 bool created = false;
8556 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
8557 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8558 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8559
8560 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8561 return -EEXIST;
8562
8563 #if defined(__linux__) && defined(O_PATH)
8564 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8565 #else
8566 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8567 #endif
8568 return -ELOOP;
8569
8570 if (r == -ENOENT && (flags & O_CREAT)) {
8571 filepath dirpath = path;
8572 string dname = dirpath.last_dentry();
8573 dirpath.pop_dentry();
8574 InodeRef dir;
8575 r = path_walk(dirpath, &dir, perms, true,
8576 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8577 if (r < 0)
8578 goto out;
8579 if (cct->_conf->client_permissions) {
8580 r = may_create(dir.get(), perms);
8581 if (r < 0)
8582 goto out;
8583 }
8584 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8585 stripe_count, object_size, data_pool, &created, perms);
8586 }
8587 if (r < 0)
8588 goto out;
8589
8590 if (!created) {
8591 // posix says we can only check permissions of existing files
8592 if (cct->_conf->client_permissions) {
8593 r = may_open(in.get(), flags, perms);
8594 if (r < 0)
8595 goto out;
8596 }
8597 }
8598
8599 if (!fh)
8600 r = _open(in.get(), flags, mode, &fh, perms);
8601 if (r >= 0) {
8602 // allocate an integer file descriptor
8603 ceph_assert(fh);
8604 r = get_fd();
8605 ceph_assert(fd_map.count(r) == 0);
8606 fd_map[r] = fh;
8607 }
8608
8609 out:
8610 tout(cct) << r << std::endl;
8611 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8612 return r;
8613 }
8614
8615 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8616 {
8617 /* Use default file striping parameters */
8618 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8619 }
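// Minimal open/read/close round trip (editorial sketch; the path and buffer
// size are hypothetical, the entry points are defined in this file).
#if 0
static int read_file_example(Client *client, const UserPerm& perms)
{
  int fd = client->open("/some/file", O_RDONLY, perms, 0);
  if (fd < 0)
    return fd;
  char buf[4096];
  int r = client->read(fd, buf, sizeof(buf), 0);        // read from offset 0
  int r2 = client->close(fd);
  return r < 0 ? r : r2;
}
#endif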
8620
8621 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8622 const UserPerm& perms)
8623 {
8624 std::lock_guard lock(client_lock);
8625 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8626
8627 if (unmounting)
8628 return -ENOTCONN;
8629
8630 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8631 filepath path(ino);
8632 req->set_filepath(path);
8633
8634 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8635 char f[30];
8636 sprintf(f, "%u", h);
8637 filepath path2(dirino);
8638 path2.push_dentry(string(f));
8639 req->set_filepath2(path2);
8640
8641 int r = make_request(req, perms, NULL, NULL,
8642 rand() % mdsmap->get_num_in_mds());
8643 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8644 return r;
8645 }
8646
8647
8648 /**
8649 * Load inode into local cache.
8650 *
8651 * If the inode pointer is non-NULL, also take a reference on
8652 * the resulting Inode object in one operation, so that the caller
8653 * can safely assume the inode will still be there after return.
8654 */
8655 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8656 {
8657 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8658
8659 if (unmounting)
8660 return -ENOTCONN;
8661
8662 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8663 filepath path(ino);
8664 req->set_filepath(path);
8665
8666 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8667 if (r == 0 && inode != NULL) {
8668 vinodeno_t vino(ino, CEPH_NOSNAP);
8669 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8670 ceph_assert(p != inode_map.end());
8671 *inode = p->second;
8672 _ll_get(*inode);
8673 }
8674 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8675 return r;
8676 }
8677
8678 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8679 {
8680 std::lock_guard lock(client_lock);
8681 return _lookup_ino(ino, perms, inode);
8682 }
8683
8684 /**
8685 * Find the parent inode of `ino` and insert it into
8686 * our cache. Conditionally also set `parent` to a referenced
8687 * Inode* if the caller provides a non-NULL value.
8688 */
8689 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8690 {
8691 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
8692
8693 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8694 filepath path(ino->ino);
8695 req->set_filepath(path);
8696
8697 InodeRef target;
8698 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8699 // Give caller a reference to the parent ino if they provided a pointer.
8700 if (parent != NULL) {
8701 if (r == 0) {
8702 *parent = target.get();
8703 _ll_get(*parent);
8704 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
8705 } else {
8706 *parent = NULL;
8707 }
8708 }
8709 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8710 return r;
8711 }
8712
8713 /**
8714 * Populate the parent dentry for `ino`, provided it is
8715 * a child of `parent`.
8716 */
8717 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8718 {
8719 ceph_assert(parent->is_dir());
8720 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
8721
8722 if (unmounting)
8723 return -ENOTCONN;
8724
8725 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8726 req->set_filepath2(filepath(parent->ino));
8727 req->set_filepath(filepath(ino->ino));
8728 req->set_inode(ino);
8729
8730 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8731 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
8732 return r;
8733 }
8734
8735 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8736 {
8737 std::lock_guard lock(client_lock);
8738 return _lookup_name(ino, parent, perms);
8739 }
8740
8741 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8742 {
8743 ceph_assert(in);
8744 Fh *f = new Fh(in, flags, cmode, perms);
8745
8746 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8747
8748 if (in->snapid != CEPH_NOSNAP) {
8749 in->snap_cap_refs++;
8750 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8751 << ccap_string(in->caps_issued()) << dendl;
8752 }
8753
8754 const auto& conf = cct->_conf;
8755 f->readahead.set_trigger_requests(1);
8756 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8757 uint64_t max_readahead = Readahead::NO_LIMIT;
8758 if (conf->client_readahead_max_bytes) {
8759 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8760 }
8761 if (conf->client_readahead_max_periods) {
8762 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8763 }
8764 f->readahead.set_max_readahead_size(max_readahead);
8765 vector<uint64_t> alignments;
8766 alignments.push_back(in->layout.get_period());
8767 alignments.push_back(in->layout.stripe_unit);
8768 f->readahead.set_alignments(alignments);
8769
8770 return f;
8771 }
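// Editorial worked example of the readahead clamp above: with
// client_readahead_max_bytes=0 and client_readahead_max_periods=4, a file
// whose layout period is 4 MiB gets max_readahead = min(NO_LIMIT, 4 * 4 MiB)
// = 16 MiB, with readahead extents aligned to the period and stripe unit.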
8772
8773 int Client::_release_fh(Fh *f)
8774 {
8775 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8776 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8777 Inode *in = f->inode.get();
8778 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
8779
8780 in->unset_deleg(f);
8781
8782 if (in->snapid == CEPH_NOSNAP) {
8783 if (in->put_open_ref(f->mode)) {
8784 _flush(in, new C_Client_FlushComplete(this, in));
8785 check_caps(in, 0);
8786 }
8787 } else {
8788 ceph_assert(in->snap_cap_refs > 0);
8789 in->snap_cap_refs--;
8790 }
8791
8792 _release_filelocks(f);
8793
8794 // Finally, read any async err (i.e. from flushes)
8795 int err = f->take_async_err();
8796 if (err != 0) {
8797 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
8798 << cpp_strerror(err) << dendl;
8799 } else {
8800 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
8801 }
8802
8803 _put_fh(f);
8804
8805 return err;
8806 }
8807
8808 void Client::_put_fh(Fh *f)
8809 {
8810 int left = f->put();
8811 if (!left) {
8812 delete f;
8813 }
8814 }
8815
8816 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8817 const UserPerm& perms)
8818 {
8819 if (in->snapid != CEPH_NOSNAP &&
8820 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8821 return -EROFS;
8822 }
8823
8824 // use normalized flags to generate cmode
8825 int cflags = ceph_flags_sys2wire(flags);
8826 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8827 cflags |= CEPH_O_LAZY;
8828
8829 int cmode = ceph_flags_to_mode(cflags);
8830 int want = ceph_caps_for_mode(cmode);
8831 int result = 0;
8832
8833 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
8834
8835 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
8836 // update wanted?
8837 check_caps(in, CHECK_CAPS_NODELAY);
8838 } else {
8839
8840 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8841 filepath path;
8842 in->make_nosnap_relative_path(path);
8843 req->set_filepath(path);
8844 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
8845 req->head.args.open.mode = mode;
8846 req->head.args.open.pool = -1;
8847 if (cct->_conf->client_debug_getattr_caps)
8848 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8849 else
8850 req->head.args.open.mask = 0;
8851 req->head.args.open.old_size = in->size; // for O_TRUNC
8852 req->set_inode(in);
8853 result = make_request(req, perms);
8854
8855 /*
8856 * NFS expects that delegations will be broken on a conflicting open,
8857 * not just when there is actual conflicting access to the file. SMB leases
8858 * and oplocks also have similar semantics.
8859 *
8860 * Ensure that clients that have delegations enabled will wait on minimal
8861 * caps during open, just to ensure that other clients holding delegations
8862 * return theirs first.
8863 */
8864 if (deleg_timeout && result == 0) {
8865 int need = 0, have;
8866
8867 if (cmode & CEPH_FILE_MODE_WR)
8868 need |= CEPH_CAP_FILE_WR;
8869 if (cmode & CEPH_FILE_MODE_RD)
8870 need |= CEPH_CAP_FILE_RD;
8871
8872 result = get_caps(in, need, want, &have, -1);
8873 if (result < 0) {
8874 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
8875 " . Denying open: " <<
8876 cpp_strerror(result) << dendl;
8877 in->put_open_ref(cmode);
8878 } else {
8879 put_cap_ref(in, need);
8880 }
8881 }
8882 }
8883
8884 // success?
8885 if (result >= 0) {
8886 if (fhp)
8887 *fhp = _create_fh(in, flags, cmode, perms);
8888 } else {
8889 in->put_open_ref(cmode);
8890 }
8891
8892 trim_cache();
8893
8894 return result;
8895 }
8896
8897 int Client::_renew_caps(Inode *in)
8898 {
8899 int wanted = in->caps_file_wanted();
8900 if (in->is_any_caps() &&
8901 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8902 check_caps(in, CHECK_CAPS_NODELAY);
8903 return 0;
8904 }
8905
8906 int flags = 0;
8907 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8908 flags = O_RDWR;
8909 else if (wanted & CEPH_CAP_FILE_RD)
8910 flags = O_RDONLY;
8911 else if (wanted & CEPH_CAP_FILE_WR)
8912 flags = O_WRONLY;
8913
8914 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8915 filepath path;
8916 in->make_nosnap_relative_path(path);
8917 req->set_filepath(path);
8918 req->head.args.open.flags = flags;
8919 req->head.args.open.pool = -1;
8920 if (cct->_conf->client_debug_getattr_caps)
8921 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8922 else
8923 req->head.args.open.mask = 0;
8924 req->set_inode(in);
8925
8926 // duplicate in case the Cap goes away; not sure whether that race is a real concern
8927 const UserPerm *pperm = in->get_best_perms();
8928 UserPerm perms;
8929 if (pperm != NULL)
8930 perms = *pperm;
8931 int ret = make_request(req, perms);
8932 return ret;
8933 }
8934
8935 int Client::close(int fd)
8936 {
8937 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8938 std::lock_guard lock(client_lock);
8939 tout(cct) << "close" << std::endl;
8940 tout(cct) << fd << std::endl;
8941
8942 if (unmounting)
8943 return -ENOTCONN;
8944
8945 Fh *fh = get_filehandle(fd);
8946 if (!fh)
8947 return -EBADF;
8948 int err = _release_fh(fh);
8949 fd_map.erase(fd);
8950 put_fd(fd);
8951 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8952 return err;
8953 }
8954
8955
8956 // ------------
8957 // read, write
8958
8959 loff_t Client::lseek(int fd, loff_t offset, int whence)
8960 {
8961 std::lock_guard lock(client_lock);
8962 tout(cct) << "lseek" << std::endl;
8963 tout(cct) << fd << std::endl;
8964 tout(cct) << offset << std::endl;
8965 tout(cct) << whence << std::endl;
8966
8967 if (unmounting)
8968 return -ENOTCONN;
8969
8970 Fh *f = get_filehandle(fd);
8971 if (!f)
8972 return -EBADF;
8973 #if defined(__linux__) && defined(O_PATH)
8974 if (f->flags & O_PATH)
8975 return -EBADF;
8976 #endif
8977 return _lseek(f, offset, whence);
8978 }
8979
8980 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8981 {
8982 Inode *in = f->inode.get();
8983 bool whence_check = false;
8984 loff_t pos = -1;
8985
8986 switch (whence) {
8987 case SEEK_END:
8988 whence_check = true;
8989 break;
8990
8991 #ifdef SEEK_DATA
8992 case SEEK_DATA:
8993 whence_check = true;
8994 break;
8995 #endif
8996
8997 #ifdef SEEK_HOLE
8998 case SEEK_HOLE:
8999 whence_check = true;
9000 break;
9001 #endif
9002 }
9003
9004 if (whence_check) {
9005 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9006 if (r < 0)
9007 return r;
9008 }
9009
9010 switch (whence) {
9011 case SEEK_SET:
9012 pos = offset;
9013 break;
9014
9015 case SEEK_CUR:
9016 pos = f->pos + offset;
9017 break;
9018
9019 case SEEK_END:
9020 pos = in->size + offset;
9021 break;
9022
9023 #ifdef SEEK_DATA
9024 case SEEK_DATA:
9025 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9026 return -ENXIO;
9027 pos = offset;
9028 break;
9029 #endif
9030
9031 #ifdef SEEK_HOLE
9032 case SEEK_HOLE:
9033 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9034 return -ENXIO;
9035 pos = in->size;
9036 break;
9037 #endif
9038
9039 default:
9040 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9041 return -EINVAL;
9042 }
9043
9044 if (pos < 0) {
9045 return -EINVAL;
9046 } else {
9047 f->pos = pos;
9048 }
9049
9050 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9051 return f->pos;
9052 }
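// Editorial note: this client keeps no extent map, so SEEK_DATA degenerates
// to "offset itself if it lies before EOF" and SEEK_HOLE to "EOF" -- a
// conservative but valid reading of the Linux semantics (data is assumed
// everywhere before EOF, with the lone hole at the end).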
9053
9054
9055 void Client::lock_fh_pos(Fh *f)
9056 {
9057 ldout(cct, 10) << __func__ << " " << f << dendl;
9058
9059 if (f->pos_locked || !f->pos_waiters.empty()) {
9060 ceph::condition_variable cond;
9061 f->pos_waiters.push_back(&cond);
9062 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9063 std::unique_lock l{client_lock, std::adopt_lock};
9064 cond.wait(l, [f, me=&cond] {
9065 return !f->pos_locked && f->pos_waiters.front() == me;
9066 });
9067 l.release();
9068 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9069 ceph_assert(f->pos_waiters.front() == &cond);
9070 f->pos_waiters.pop_front();
9071 }
9072
9073 f->pos_locked = true;
9074 }
9075
9076 void Client::unlock_fh_pos(Fh *f)
9077 {
9078 ldout(cct, 10) << __func__ << " " << f << dendl;
9079 f->pos_locked = false;
9080 }
9081
9082 int Client::uninline_data(Inode *in, Context *onfinish)
9083 {
9084 if (!in->inline_data.length()) {
9085 onfinish->complete(0);
9086 return 0;
9087 }
9088
9089 char oid_buf[32];
9090 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9091 object_t oid = oid_buf;
9092
9093 ObjectOperation create_ops;
9094 create_ops.create(false);
9095
9096 objecter->mutate(oid,
9097 OSDMap::file_to_object_locator(in->layout),
9098 create_ops,
9099 in->snaprealm->get_snap_context(),
9100 ceph::real_clock::now(),
9101 0,
9102 NULL);
9103
9104 bufferlist inline_version_bl;
9105 encode(in->inline_version, inline_version_bl);
9106
9107 ObjectOperation uninline_ops;
9108 uninline_ops.cmpxattr("inline_version",
9109 CEPH_OSD_CMPXATTR_OP_GT,
9110 CEPH_OSD_CMPXATTR_MODE_U64,
9111 inline_version_bl);
9112 bufferlist inline_data = in->inline_data;
9113 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9114 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9115
9116 objecter->mutate(oid,
9117 OSDMap::file_to_object_locator(in->layout),
9118 uninline_ops,
9119 in->snaprealm->get_snap_context(),
9120 ceph::real_clock::now(),
9121 0,
9122 onfinish);
9123
9124 return 0;
9125 }
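// Editorial note: the two mutates above are deliberately split -- first an
// idempotent create of the backing object, then a write guarded by a
// cmpxattr on "inline_version" so a stale uninline cannot clobber data
// already migrated under a newer version; the xattr is then stamped with
// the version that was written out.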
9126
9127 //
9128
9129 // blocking osd interface
9130
9131 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9132 {
9133 std::lock_guard lock(client_lock);
9134 tout(cct) << "read" << std::endl;
9135 tout(cct) << fd << std::endl;
9136 tout(cct) << size << std::endl;
9137 tout(cct) << offset << std::endl;
9138
9139 if (unmounting)
9140 return -ENOTCONN;
9141
9142 Fh *f = get_filehandle(fd);
9143 if (!f)
9144 return -EBADF;
9145 #if defined(__linux__) && defined(O_PATH)
9146 if (f->flags & O_PATH)
9147 return -EBADF;
9148 #endif
9149 bufferlist bl;
9150 /* We can't return a byte count larger than INT_MAX; clamp size to that */
9151 size = std::min(size, (loff_t)INT_MAX);
9152 int r = _read(f, offset, size, &bl);
9153 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9154 if (r >= 0) {
9155 bl.begin().copy(bl.length(), buf);
9156 r = bl.length();
9157 }
9158 return r;
9159 }
9160
9161 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9162 {
9163 if (iovcnt < 0)
9164 return -EINVAL;
9165 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9166 }
9167
9168 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9169 {
9170 int want, have = 0;
9171 bool movepos = false;
9172 std::unique_ptr<C_SaferCond> onuninline;
9173 int64_t r = 0;
9174 const auto& conf = cct->_conf;
9175 Inode *in = f->inode.get();
9176 utime_t lat;
9177 utime_t start = ceph_clock_now();
9178
9179 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9180 return -EBADF;
9181 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9182
9183 if (offset < 0) {
9184 lock_fh_pos(f);
9185 offset = f->pos;
9186 movepos = true;
9187 }
9188 loff_t start_pos = offset;
9189
9190 if (in->inline_version == 0) {
9191 r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9192 if (r < 0) {
9193 goto done;
9194 }
9195 ceph_assert(in->inline_version > 0);
9196 }
9197
9198 retry:
9199 if (f->mode & CEPH_FILE_MODE_LAZY)
9200 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9201 else
9202 want = CEPH_CAP_FILE_CACHE;
9203 r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
9204 if (r < 0) {
9205 goto done;
9206 }
9207 if (f->flags & O_DIRECT)
9208 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9209
9210 if (in->inline_version < CEPH_INLINE_NONE) {
9211 if (!(have & CEPH_CAP_FILE_CACHE)) {
9212 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9213 uninline_data(in, onuninline.get());
9214 } else {
9215 uint32_t len = in->inline_data.length();
9216 uint64_t endoff = offset + size;
9217 if (endoff > in->size)
9218 endoff = in->size;
9219
9220 if (offset < len) {
9221 if (endoff <= len) {
9222 bl->substr_of(in->inline_data, offset, endoff - offset);
9223 } else {
9224 bl->substr_of(in->inline_data, offset, len - offset);
9225 bl->append_zero(endoff - len);
9226 }
9227 r = endoff - offset;
9228 } else if ((uint64_t)offset < endoff) {
9229 bl->append_zero(endoff - offset);
9230 r = endoff - offset;
9231 } else {
9232 r = 0;
9233 }
9234 goto success;
9235 }
9236 }
9237
9238 if (!conf->client_debug_force_sync_read &&
9239 conf->client_oc &&
9240 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9241
9242 if (f->flags & O_RSYNC) {
9243 _flush_range(in, offset, size);
9244 }
9245 r = _read_async(f, offset, size, bl);
9246 if (r < 0)
9247 goto done;
9248 } else {
9249 if (f->flags & O_DIRECT)
9250 _flush_range(in, offset, size);
9251
9252 bool checkeof = false;
9253 r = _read_sync(f, offset, size, bl, &checkeof);
9254 if (r < 0)
9255 goto done;
9256 if (checkeof) {
9257 offset += r;
9258 size -= r;
9259
9260 put_cap_ref(in, CEPH_CAP_FILE_RD);
9261 have = 0;
9262 // reverify size
9263 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9264 if (r < 0)
9265 goto done;
9266
9267 // eof? short read.
9268 if ((uint64_t)offset < in->size)
9269 goto retry;
9270 }
9271 }
9272
9273 success:
9274 ceph_assert(r >= 0);
9275 if (movepos) {
9276 // adjust fd pos
9277 f->pos = start_pos + r;
9278 }
9279
9280 lat = ceph_clock_now();
9281 lat -= start;
9282 logger->tinc(l_c_read, lat);
9283
9284 done:
9285 // done!
9286
9287 if (onuninline) {
9288 client_lock.unlock();
9289 int ret = onuninline->wait();
9290 client_lock.lock();
9291 if (ret >= 0 || ret == -ECANCELED) {
9292 in->inline_data.clear();
9293 in->inline_version = CEPH_INLINE_NONE;
9294 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9295 check_caps(in, 0);
9296 } else
9297 r = ret;
9298 }
9299 if (have) {
9300 put_cap_ref(in, CEPH_CAP_FILE_RD);
9301 }
9302 if (movepos) {
9303 unlock_fh_pos(f);
9304 }
9305 return r;
9306 }
9307
9308 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9309 client(c), f(f) {
9310 f->get();
9311 f->readahead.inc_pending();
9312 }
9313
9314 Client::C_Readahead::~C_Readahead() {
9315 f->readahead.dec_pending();
9316 client->_put_fh(f);
9317 }
9318
9319 void Client::C_Readahead::finish(int r) {
9320 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9321 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9322 }
9323
9324 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9325 {
9326 const auto& conf = cct->_conf;
9327 Inode *in = f->inode.get();
9328
9329 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9330
9331 // trim read based on file size?
9332 if (off >= in->size)
9333 return 0;
9334 if (len == 0)
9335 return 0;
9336 if (off + len > in->size) {
9337 len = in->size - off;
9338 }
9339
9340 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9341 << " max_bytes=" << f->readahead.get_max_readahead_size()
9342 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9343
9344 // read (and possibly block)
9345 int r = 0;
9346 C_SaferCond onfinish("Client::_read_async flock");
9347 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9348 off, len, bl, 0, &onfinish);
9349 if (r == 0) {
9350 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9351 client_lock.unlock();
9352 r = onfinish.wait();
9353 client_lock.lock();
9354 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9355 }
9356
9357 if (f->readahead.get_min_readahead_size() > 0) {
9358 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9359 if (readahead_extent.second > 0) {
9360 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9361 << " (caller wants " << off << "~" << len << ")" << dendl;
9362 Context *onfinish2 = new C_Readahead(this, f);
9363 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9364 readahead_extent.first, readahead_extent.second,
9365 NULL, 0, onfinish2);
9366 if (r2 == 0) {
9367 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9368 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9369 } else {
9370 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9371 delete onfinish2;
9372 }
9373 }
9374 }
9375
9376 return r;
9377 }
9378
9379 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9380 bool *checkeof)
9381 {
9382 Inode *in = f->inode.get();
9383 uint64_t pos = off;
9384 int left = len;
9385 int read = 0;
9386
9387 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9388
9389 while (left > 0) {
9390 C_SaferCond onfinish("Client::_read_sync flock");
9391 bufferlist tbl;
9392
9393 int wanted = left;
9394 filer->read_trunc(in->ino, &in->layout, in->snapid,
9395 pos, left, &tbl, 0,
9396 in->truncate_size, in->truncate_seq,
9397 &onfinish);
9398 client_lock.unlock();
9399 int r = onfinish.wait();
9400 client_lock.lock();
9401
9402 // if we get ENOENT from OSD, assume 0 bytes returned
9403 if (r == -ENOENT)
9404 r = 0;
9405 if (r < 0)
9406 return r;
9407 if (tbl.length()) {
9408 r = tbl.length();
9409
9410 read += r;
9411 pos += r;
9412 left -= r;
9413 bl->claim_append(tbl);
9414 }
9415 // short read?
9416 if (r >= 0 && r < wanted) {
9417 if (pos < in->size) {
9418 // zero up to known EOF
9419 int64_t some = in->size - pos;
9420 if (some > left)
9421 some = left;
9422 auto z = buffer::ptr_node::create(some);
9423 z->zero();
9424 bl->push_back(std::move(z));
9425 read += some;
9426 pos += some;
9427 left -= some;
9428 if (left == 0)
9429 return read;
9430 }
9431
9432 *checkeof = true;
9433 return read;
9434 }
9435 }
9436 return read;
9437 }
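
// Worked example of the short-read handling above (illustrative numbers):
// suppose in->size is 8192 but the OSD read returns only the first 4096
// bytes (a missing backing object even comes back as ENOENT, which we fold
// into a 0-byte read). Since pos is still below the locally known size,
// the remaining 4096 bytes are appended as zeros so the caller sees the
// hole as zeros, matching sparse-file semantics. Only once pos reaches the
// known EOF is *checkeof set, telling the caller to re-verify the size
// with the MDS and possibly retry.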
9438
9439
9440 /*
9441 * we keep count of uncommitted sync writes on the inode, so that
9442 * fsync can DTRT (do the right thing) and wait for them.
9443 */
9444 void Client::_sync_write_commit(Inode *in)
9445 {
9446 ceph_assert(unsafe_sync_write > 0);
9447 unsafe_sync_write--;
9448
9449 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9450
9451 ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
9452 if (unsafe_sync_write == 0 && unmounting) {
9453 ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
9454 mount_cond.notify_all();
9455 }
9456 }
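
// Explanatory sketch of the pairing (not new behavior): the sync-write
// path in _write() does
//
//   unsafe_sync_write++;
//   get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
//
// before issuing the OSD write, and _sync_write_commit() undoes exactly
// both once the write is safe, waking the unmount waiter when the count
// of in-flight sync writes drops to zero.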
9457
9458 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9459 {
9460 std::lock_guard lock(client_lock);
9461 tout(cct) << "write" << std::endl;
9462 tout(cct) << fd << std::endl;
9463 tout(cct) << size << std::endl;
9464 tout(cct) << offset << std::endl;
9465
9466 if (unmounting)
9467 return -ENOTCONN;
9468
9469 Fh *fh = get_filehandle(fd);
9470 if (!fh)
9471 return -EBADF;
9472 #if defined(__linux__) && defined(O_PATH)
9473 if (fh->flags & O_PATH)
9474 return -EBADF;
9475 #endif
9476 /* We can't return a byte count larger than INT_MAX, so clamp size to that */
9477 size = std::min(size, (loff_t)INT_MAX);
9478 int r = _write(fh, offset, size, buf, NULL, false);
9479 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9480 return r;
9481 }
9482
9483 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9484 {
9485 if (iovcnt < 0)
9486 return -EINVAL;
9487 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9488 }
9489
9490 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9491 unsigned iovcnt, int64_t offset, bool write,
9492 bool clamp_to_int)
9493 {
9494 #if defined(__linux__) && defined(O_PATH)
9495 if (fh->flags & O_PATH)
9496 return -EBADF;
9497 #endif
9498 loff_t totallen = 0;
9499 for (unsigned i = 0; i < iovcnt; i++) {
9500 totallen += iov[i].iov_len;
9501 }
9502
9503 /*
9504 * Some of the API functions take 64-bit size values, but only return
9505 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9506 * we don't do I/Os larger than the values we can return.
9507 */
9508 if (clamp_to_int) {
9509 totallen = std::min(totallen, (loff_t)INT_MAX);
9510 }
9511 if (write) {
9512 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9513 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9514 return w;
9515 } else {
9516 bufferlist bl;
9517 int64_t r = _read(fh, offset, totallen, &bl);
9518 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9519 if (r <= 0)
9520 return r;
9521
9522 auto iter = bl.cbegin();
9523 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9524 /*
9525 * Copy only as much as the bufferlist actually holds, to handle the
9526 * case where it has less data than the iovs can accept.
9527 */
9528 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
9529 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
9530 resid -= round_size;
9531 /* iter is self-updating */
9532 }
9533 return r;
9534 }
9535 }
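
// Explanatory note on the readv scatter loop above: the single contiguous
// bufferlist of r bytes is copied across the iovecs in order, stopping
// early if the data runs out. E.g. with r = 10 and two 8-byte iovs:
//
//   j = 0: round_size = min(10, 8) = 8, resid = 2
//   j = 1: round_size = min(2, 8)  = 2, resid = 0  -> done
//
// so a short read fills the earlier iovecs first, exactly like preadv(2).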
9536
9537 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9538 {
9539 std::lock_guard lock(client_lock);
9540 tout(cct) << fd << std::endl;
9541 tout(cct) << offset << std::endl;
9542
9543 if (unmounting)
9544 return -ENOTCONN;
9545
9546 Fh *fh = get_filehandle(fd);
9547 if (!fh)
9548 return -EBADF;
9549 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9550 }
9551
9552 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9553 const struct iovec *iov, int iovcnt)
9554 {
9555 uint64_t fpos = 0;
9556
9557 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9558 return -EFBIG;
9559
9560 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9561 Inode *in = f->inode.get();
9562
9563 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9564 return -ENOSPC;
9565 }
9566
9567 ceph_assert(in->snapid == CEPH_NOSNAP);
9568
9569 // was Fh opened as writeable?
9570 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9571 return -EBADF;
9572
9573 // use/adjust fd pos?
9574 if (offset < 0) {
9575 lock_fh_pos(f);
9576 /*
9577 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9578 * change out from under us.
9579 */
9580 if (f->flags & O_APPEND) {
9581 auto r = _lseek(f, 0, SEEK_END);
9582 if (r < 0) {
9583 unlock_fh_pos(f);
9584 return r;
9585 }
9586 }
9587 offset = f->pos;
9588 fpos = offset+size;
9589 unlock_fh_pos(f);
9590 }
9591
9592 // check quota
9593 uint64_t endoff = offset + size;
9594 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9595 f->actor_perms)) {
9596 return -EDQUOT;
9597 }
9598
9599 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9600
9601 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9602
9603 // time it.
9604 utime_t start = ceph_clock_now();
9605
9606 if (in->inline_version == 0) {
9607 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9608 if (r < 0)
9609 return r;
9610 ceph_assert(in->inline_version > 0);
9611 }
9612
9613 // copy into a fresh buffer (since our write may be resubmitted or complete asynchronously)
9614 bufferlist bl;
9615 if (buf) {
9616 if (size > 0)
9617 bl.append(buf, size);
9618 } else if (iov) {
9619 for (int i = 0; i < iovcnt; i++) {
9620 if (iov[i].iov_len > 0) {
9621 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9622 }
9623 }
9624 }
9625
9626 utime_t lat;
9627 uint64_t totalwritten;
9628 int want, have;
9629 if (f->mode & CEPH_FILE_MODE_LAZY)
9630 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
9631 else
9632 want = CEPH_CAP_FILE_BUFFER;
9633 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
9634 if (r < 0)
9635 return r;
9636
9637 /* clear the setuid/setgid bits, if any */
9638 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9639 struct ceph_statx stx = { 0 };
9640
9641 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9642 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9643 if (r < 0)
9644 return r;
9645 } else {
9646 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9647 }
9648
9649 if (f->flags & O_DIRECT)
9650 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
9651
9652 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9653
9654 std::unique_ptr<C_SaferCond> onuninline = nullptr;
9655
9656 if (in->inline_version < CEPH_INLINE_NONE) {
9657 if (endoff > cct->_conf->client_max_inline_size ||
9658 endoff > CEPH_INLINE_MAX_SIZE ||
9659 !(have & CEPH_CAP_FILE_BUFFER)) {
9660 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9661 uninline_data(in, onuninline.get());
9662 } else {
9663 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9664
9665 uint32_t len = in->inline_data.length();
9666
9667 if (endoff < len)
9668 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
9669
9670 if (offset < len)
9671 in->inline_data.splice(offset, len - offset);
9672 else if (offset > len)
9673 in->inline_data.append_zero(offset - len);
9674
9675 in->inline_data.append(bl);
9676 in->inline_version++;
9677
9678 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9679
9680 goto success;
9681 }
9682 }
9683
9684 if (cct->_conf->client_oc &&
9685 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
9686 // do buffered write
9687 if (!in->oset.dirty_or_tx)
9688 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9689
9690 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9691
9692 // async, caching, non-blocking.
9693 r = objectcacher->file_write(&in->oset, &in->layout,
9694 in->snaprealm->get_snap_context(),
9695 offset, size, bl, ceph::real_clock::now(),
9696 0);
9697 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9698
9699 if (r < 0)
9700 goto done;
9701
9702 // flush cached write if O_SYNC is set on file fh
9703 // O_DSYNC == O_SYNC on linux < 2.6.33
9704 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9705 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9706 _flush_range(in, offset, size);
9707 }
9708 } else {
9709 if (f->flags & O_DIRECT)
9710 _flush_range(in, offset, size);
9711
9712 // simple, non-atomic sync write
9713 C_SaferCond onfinish("Client::_write flock");
9714 unsafe_sync_write++;
9715 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9716
9717 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9718 offset, size, bl, ceph::real_clock::now(), 0,
9719 in->truncate_size, in->truncate_seq,
9720 &onfinish);
9721 client_lock.unlock();
9722 onfinish.wait();
9723 client_lock.lock();
9724 _sync_write_commit(in);
9725 }
9726
9727 // if we get here, the write was successful; update client metadata
9728 success:
9729 // time
9730 lat = ceph_clock_now();
9731 lat -= start;
9732 logger->tinc(l_c_wrlat, lat);
9733
9734 if (fpos) {
9735 lock_fh_pos(f);
9736 f->pos = fpos;
9737 unlock_fh_pos(f);
9738 }
9739 totalwritten = size;
9740 r = (int64_t)totalwritten;
9741
9742 // extend file?
9743 if (totalwritten + offset > in->size) {
9744 in->size = totalwritten + offset;
9745 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9746
9747 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9748 check_caps(in, CHECK_CAPS_NODELAY);
9749 } else if (is_max_size_approaching(in)) {
9750 check_caps(in, 0);
9751 }
9752
9753 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9754 } else {
9755 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9756 }
9757
9758 // mtime
9759 in->mtime = in->ctime = ceph_clock_now();
9760 in->change_attr++;
9761 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9762
9763 done:
9764
9765 if (nullptr != onuninline) {
9766 client_lock.unlock();
9767 int uninline_ret = onuninline->wait();
9768 client_lock.lock();
9769
9770 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9771 in->inline_data.clear();
9772 in->inline_version = CEPH_INLINE_NONE;
9773 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9774 check_caps(in, 0);
9775 } else
9776 r = uninline_ret;
9777 }
9778
9779 put_cap_ref(in, CEPH_CAP_FILE_WR);
9780 return r;
9781 }
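
// Explanatory summary of the three write paths in _write() above (a
// restatement, not new behavior):
//  1. inline: if the inode still carries inline data and the write fits
//     under client_max_inline_size, the bytes are patched in memory while
//     holding CEPH_CAP_FILE_BUFFER; otherwise uninline_data() kicks the
//     data out to RADOS first and we fall through.
//  2. buffered: with client_oc enabled and BUFFER/LAZYIO caps held, the
//     ObjectCacher absorbs the write; O_SYNC/O_DSYNC then force a
//     synchronous _flush_range() of just the written extent.
//  3. sync: otherwise filer->write_trunc() goes straight to the OSDs and
//     we block on the completion with client_lock dropped.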
9782
9783 int Client::_flush(Fh *f)
9784 {
9785 Inode *in = f->inode.get();
9786 int err = f->take_async_err();
9787 if (err != 0) {
9788 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9789 << cpp_strerror(err) << dendl;
9790 } else {
9791 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9792 }
9793
9794 return err;
9795 }
9796
9797 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9798 {
9799 struct ceph_statx stx;
9800 stx.stx_size = length;
9801 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9802 }
9803
9804 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9805 {
9806 std::lock_guard lock(client_lock);
9807 tout(cct) << __func__ << std::endl;
9808 tout(cct) << fd << std::endl;
9809 tout(cct) << length << std::endl;
9810
9811 if (unmounting)
9812 return -ENOTCONN;
9813
9814 Fh *f = get_filehandle(fd);
9815 if (!f)
9816 return -EBADF;
9817 #if defined(__linux__) && defined(O_PATH)
9818 if (f->flags & O_PATH)
9819 return -EBADF;
9820 #endif
9821 struct stat attr;
9822 attr.st_size = length;
9823 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9824 }
9825
9826 int Client::fsync(int fd, bool syncdataonly)
9827 {
9828 std::lock_guard lock(client_lock);
9829 tout(cct) << "fsync" << std::endl;
9830 tout(cct) << fd << std::endl;
9831 tout(cct) << syncdataonly << std::endl;
9832
9833 if (unmounting)
9834 return -ENOTCONN;
9835
9836 Fh *f = get_filehandle(fd);
9837 if (!f)
9838 return -EBADF;
9839 #if defined(__linux__) && defined(O_PATH)
9840 if (f->flags & O_PATH)
9841 return -EBADF;
9842 #endif
9843 int r = _fsync(f, syncdataonly);
9844 if (r == 0) {
9845 // The IOs in this fsync were okay, but maybe something happened
9846 // in the background that we should be reporting?
9847 r = f->take_async_err();
9848 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9849 << ") = 0, async_err = " << r << dendl;
9850 } else {
9851 // Assume that an error we encountered during fsync, even reported
9852 // synchronously, would also have applied the error to the Fh, and we
9853 // should clear it here to avoid returning the same error again on next
9854 // call.
9855 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9856 << r << dendl;
9857 f->take_async_err();
9858 }
9859 return r;
9860 }
9861
9862 int Client::_fsync(Inode *in, bool syncdataonly)
9863 {
9864 int r = 0;
9865 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
9866 ceph_tid_t flush_tid = 0;
9867 InodeRef tmp_ref;
9868 utime_t lat;
9869 utime_t start = ceph_clock_now();
9870
9871 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9872
9873 if (cct->_conf->client_oc) {
9874 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
9875 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
9876 _flush(in, object_cacher_completion.get());
9877 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9878 }
9879
9880 if (!syncdataonly && in->dirty_caps) {
9881 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9882 if (in->flushing_caps)
9883 flush_tid = last_flush_tid;
9884 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9885
9886 if (!syncdataonly && !in->unsafe_ops.empty()) {
9887 flush_mdlog_sync();
9888
9889 MetaRequest *req = in->unsafe_ops.back();
9890 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9891
9892 req->get();
9893 wait_on_list(req->waitfor_safe);
9894 put_request(req);
9895 }
9896
9897 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
9898 client_lock.unlock();
9899 ldout(cct, 15) << "waiting on data to flush" << dendl;
9900 r = object_cacher_completion->wait();
9901 client_lock.lock();
9902 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9903 } else {
9904 // FIXME: this can starve
9905 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9906 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9907 << " uncommitted, waiting" << dendl;
9908 wait_on_list(in->waitfor_commit);
9909 }
9910 }
9911
9912 if (!r) {
9913 if (flush_tid > 0)
9914 wait_sync_caps(in, flush_tid);
9915
9916 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9917 } else {
9918 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
9919 << cpp_strerror(-r) << dendl;
9920 }
9921
9922 lat = ceph_clock_now();
9923 lat -= start;
9924 logger->tinc(l_c_fsync, lat);
9925
9926 return r;
9927 }
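
// Hedged sketch of the ordering above: the data flush is kicked off first
// (via the ObjectCacher completion, or by waiting out FILE_BUFFER refs
// when client_oc is off), dirty caps and unsafe MDS requests are flushed
// unless syncdataonly, and only then do we block on the data completion
// and finally wait_sync_caps() up to flush_tid. A data-only sync:
//
//   int r = _fsync(in, /*syncdataonly=*/true);  // skips the cap/MDS waits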
9928
9929 int Client::_fsync(Fh *f, bool syncdataonly)
9930 {
9931 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9932 return _fsync(f->inode.get(), syncdataonly);
9933 }
9934
9935 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9936 {
9937 std::lock_guard lock(client_lock);
9938 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9939 tout(cct) << fd << std::endl;
9940
9941 if (unmounting)
9942 return -ENOTCONN;
9943
9944 Fh *f = get_filehandle(fd);
9945 if (!f)
9946 return -EBADF;
9947 int r = _getattr(f->inode, mask, perms);
9948 if (r < 0)
9949 return r;
9950 fill_stat(f->inode, stbuf, NULL);
9951 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9952 return r;
9953 }
9954
9955 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9956 unsigned int want, unsigned int flags)
9957 {
9958 std::lock_guard lock(client_lock);
9959 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9960 tout(cct) << fd << std::endl;
9961
9962 if (unmounting)
9963 return -ENOTCONN;
9964
9965 Fh *f = get_filehandle(fd);
9966 if (!f)
9967 return -EBADF;
9968
9969 unsigned mask = statx_to_mask(flags, want);
9970
9971 int r = 0;
9972 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9973 r = _getattr(f->inode, mask, perms);
9974 if (r < 0) {
9975 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9976 return r;
9977 }
9978 }
9979
9980 fill_statx(f->inode, mask, stx);
9981 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9982 return r;
9983 }
9984
9985 // not written yet, but I want to link!
9986
9987 int Client::chdir(const char *relpath, std::string &new_cwd,
9988 const UserPerm& perms)
9989 {
9990 std::lock_guard lock(client_lock);
9991 tout(cct) << "chdir" << std::endl;
9992 tout(cct) << relpath << std::endl;
9993
9994 if (unmounting)
9995 return -ENOTCONN;
9996
9997 filepath path(relpath);
9998 InodeRef in;
9999 int r = path_walk(path, &in, perms);
10000 if (r < 0)
10001 return r;
10002
10003 if (!in->is_dir())
10004 return -ENOTDIR;
10005
10006 if (cwd != in)
10007 cwd.swap(in);
10008 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10009
10010 _getcwd(new_cwd, perms);
10011 return 0;
10012 }
10013
10014 void Client::_getcwd(string& dir, const UserPerm& perms)
10015 {
10016 filepath path;
10017 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10018
10019 Inode *in = cwd.get();
10020 while (in != root) {
10021 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10022
10023 // The cwd or an ancestor is unlinked
10024 if (in->dentries.empty()) {
10025 return;
10026 }
10027
10028 Dentry *dn = in->get_first_parent();
10029
10030
10031 if (!dn) {
10032 // look it up
10033 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10034 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10035 filepath path(in->ino);
10036 req->set_filepath(path);
10037 req->set_inode(in);
10038 int res = make_request(req, perms);
10039 if (res < 0)
10040 break;
10041
10042 // start over
10043 path = filepath();
10044 in = cwd.get();
10045 continue;
10046 }
10047 path.push_front_dentry(dn->name);
10048 in = dn->dir->parent_inode;
10049 }
10050 dir = "/";
10051 dir += path.get_path();
10052 }
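
// Explanatory sketch: _getcwd() walks dentries from cwd up toward root,
// prepending each name. If some ancestor has no cached dentry it issues a
// CEPH_MDS_OP_LOOKUPNAME so the MDS fills one in, then restarts from cwd.
// E.g. for a cwd of /a/b/c the loop pushes "c", then "b", then "a", and
// the result is "/" + path.get_path() == "/a/b/c".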
10053
10054 void Client::getcwd(string& dir, const UserPerm& perms)
10055 {
10056 std::lock_guard l(client_lock);
10057 if (!unmounting)
10058 _getcwd(dir, perms);
10059 }
10060
10061 int Client::statfs(const char *path, struct statvfs *stbuf,
10062 const UserPerm& perms)
10063 {
10064 std::lock_guard l(client_lock);
10065 tout(cct) << __func__ << std::endl;
10066 unsigned long int total_files_on_fs;
10067
10068 if (unmounting)
10069 return -ENOTCONN;
10070
10071 ceph_statfs stats;
10072 C_SaferCond cond;
10073
10074 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10075 if (data_pools.size() == 1) {
10076 objecter->get_fs_stats(stats, data_pools[0], &cond);
10077 } else {
10078 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10079 }
10080
10081 client_lock.unlock();
10082 int rval = cond.wait();
10083 ceph_assert(root);
10084 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10085 client_lock.lock();
10086
10087 if (rval < 0) {
10088 ldout(cct, 1) << "underlying call to statfs returned error: "
10089 << cpp_strerror(rval)
10090 << dendl;
10091 return rval;
10092 }
10093
10094 memset(stbuf, 0, sizeof(*stbuf));
10095
10096 /*
10097 * we're going to set a block size of 4MB so we can represent larger
10098 * FSes without overflowing. Additionally convert the space
10099 * measurements from KB to bytes while making them in terms of
10100 * blocks. We use 4MB only because it is big enough, and because it
10101 * actually *is* the (ceph) default block size.
10102 */
10103 const int CEPH_BLOCK_SHIFT = 22;
10104 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10105 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10106 stbuf->f_files = total_files_on_fs;
10107 stbuf->f_ffree = 0;
10108 stbuf->f_favail = -1;
10109 stbuf->f_fsid = -1; // ??
10110 stbuf->f_flag = 0; // ??
10111 stbuf->f_namemax = NAME_MAX;
10112
10113 // Usually quota_root will == root_ancestor, but if the mount root has no
10114 // quota but we can see a parent of it that does have a quota, we'll
10115 // respect that one instead.
10116 ceph_assert(root != nullptr);
10117 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10118
10119 // get_quota_root should always give us something
10120 // because client quotas are always enabled
10121 ceph_assert(quota_root != nullptr);
10122
10123 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10124
10125 // Skip the getattr if any sessions are stale, as we don't want to
10126 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10127 // is unhealthy.
10128 if (!_any_stale_sessions()) {
10129 int r = _getattr(quota_root, 0, perms, true);
10130 if (r != 0) {
10131 // Ignore return value: error getting latest inode metadata is not a good
10132 // reason to break "df".
10133 lderr(cct) << "Error in getattr on quota root 0x"
10134 << std::hex << quota_root->ino << std::dec
10135 << " statfs result may be outdated" << dendl;
10136 }
10137 }
10138
10139 // Special case: if there is a size quota set on the Inode acting
10140 // as the root for this client mount, then report the quota status
10141 // as the filesystem statistics.
10142 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10143 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10144 // It is possible for a quota to be exceeded: arithmetic here must
10145 // handle the case where used > total.
10146 const fsblkcnt_t free = total > used ? total - used : 0;
10147
10148 stbuf->f_blocks = total;
10149 stbuf->f_bfree = free;
10150 stbuf->f_bavail = free;
10151 } else {
10152 // General case: report the cluster statistics returned from RADOS. Because
10153 // multiple pools may be used within one filesystem namespace via
10154 // layouts, this is the most correct thing we can do.
10155 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10156 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10157 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10158 }
10159
10160 return rval;
10161 }
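
// Worked example of the quota -> statvfs conversion above (illustrative
// numbers only): with quota.max_bytes = 10 GiB and rstat.rbytes = 1 GiB,
//
//   f_frsize = 1 << 22        -> 4 MiB blocks
//   f_blocks = 10 GiB >> 22   -> 2560
//   f_bfree  = 2560 - 256     -> 2304
//
// In the general case stats.kb is shifted by (CEPH_BLOCK_SHIFT - 10) to
// convert the KiB-denominated pool statistics into the same 4 MiB blocks.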
10162
10163 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10164 struct flock *fl, uint64_t owner, bool removing)
10165 {
10166 ldout(cct, 10) << __func__ << " ino " << in->ino
10167 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10168 << " type " << fl->l_type << " owner " << owner
10169 << " " << fl->l_start << "~" << fl->l_len << dendl;
10170
10171 int lock_cmd;
10172 if (F_RDLCK == fl->l_type)
10173 lock_cmd = CEPH_LOCK_SHARED;
10174 else if (F_WRLCK == fl->l_type)
10175 lock_cmd = CEPH_LOCK_EXCL;
10176 else if (F_UNLCK == fl->l_type)
10177 lock_cmd = CEPH_LOCK_UNLOCK;
10178 else
10179 return -EIO;
10180
10181 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10182 sleep = 0;
10183
10184 /*
10185 * Set the most significant bit, so that the MDS knows the 'owner'
10186 * field alone is sufficient to identify the owner of the lock. (old
10187 * code used both 'owner' and 'pid')
10188 */
10189 owner |= (1ULL << 63);
10190
10191 MetaRequest *req = new MetaRequest(op);
10192 filepath path;
10193 in->make_nosnap_relative_path(path);
10194 req->set_filepath(path);
10195 req->set_inode(in);
10196
10197 req->head.args.filelock_change.rule = lock_type;
10198 req->head.args.filelock_change.type = lock_cmd;
10199 req->head.args.filelock_change.owner = owner;
10200 req->head.args.filelock_change.pid = fl->l_pid;
10201 req->head.args.filelock_change.start = fl->l_start;
10202 req->head.args.filelock_change.length = fl->l_len;
10203 req->head.args.filelock_change.wait = sleep;
10204
10205 int ret;
10206 bufferlist bl;
10207
10208 if (sleep && switch_interrupt_cb) {
10209 // enable interrupt
10210 switch_interrupt_cb(callback_handle, req->get());
10211 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10212 // disable interrupt
10213 switch_interrupt_cb(callback_handle, NULL);
10214 if (ret == 0 && req->aborted()) {
10215 // effect of this lock request has been revoked by the 'lock intr' request
10216 ret = req->get_abort_code();
10217 }
10218 put_request(req);
10219 } else {
10220 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10221 }
10222
10223 if (ret == 0) {
10224 if (op == CEPH_MDS_OP_GETFILELOCK) {
10225 ceph_filelock filelock;
10226 auto p = bl.cbegin();
10227 decode(filelock, p);
10228
10229 if (CEPH_LOCK_SHARED == filelock.type)
10230 fl->l_type = F_RDLCK;
10231 else if (CEPH_LOCK_EXCL == filelock.type)
10232 fl->l_type = F_WRLCK;
10233 else
10234 fl->l_type = F_UNLCK;
10235
10236 fl->l_whence = SEEK_SET;
10237 fl->l_start = filelock.start;
10238 fl->l_len = filelock.length;
10239 fl->l_pid = filelock.pid;
10240 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10241 ceph_lock_state_t *lock_state;
10242 if (lock_type == CEPH_LOCK_FCNTL) {
10243 if (!in->fcntl_locks)
10244 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10245 lock_state = in->fcntl_locks.get();
10246 } else if (lock_type == CEPH_LOCK_FLOCK) {
10247 if (!in->flock_locks)
10248 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10249 lock_state = in->flock_locks.get();
10250 } else {
10251 ceph_abort();
10252 return -EINVAL;
10253 }
10254 _update_lock_state(fl, owner, lock_state);
10255
10256 if (!removing) {
10257 if (lock_type == CEPH_LOCK_FCNTL) {
10258 if (!fh->fcntl_locks)
10259 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10260 lock_state = fh->fcntl_locks.get();
10261 } else {
10262 if (!fh->flock_locks)
10263 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10264 lock_state = fh->flock_locks.get();
10265 }
10266 _update_lock_state(fl, owner, lock_state);
10267 }
10268 } else
10269 ceph_abort();
10270 }
10271 return ret;
10272 }
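
// Explanatory example: because bit 63 is forced on above, a hypothetical
// caller-supplied owner cookie of 0x1234 travels to the MDS as
//
//   0x1234ULL | (1ULL << 63) == 0x8000000000001234ULL
//
// which signals that 'owner' alone identifies the lock holder, rather
// than the (owner, pid) pair that older clients relied on.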
10273
10274 int Client::_interrupt_filelock(MetaRequest *req)
10275 {
10276 // Set abort code, but do not kick. The abort code prevents the request
10277 // from being re-sent.
10278 req->abort(-EINTR);
10279 if (req->mds < 0)
10280 return 0; // haven't sent the request
10281
10282 Inode *in = req->inode();
10283
10284 int lock_type;
10285 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10286 lock_type = CEPH_LOCK_FLOCK_INTR;
10287 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10288 lock_type = CEPH_LOCK_FCNTL_INTR;
10289 else {
10290 ceph_abort();
10291 return -EINVAL;
10292 }
10293
10294 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10295 filepath path;
10296 in->make_nosnap_relative_path(path);
10297 intr_req->set_filepath(path);
10298 intr_req->set_inode(in);
10299 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10300 intr_req->head.args.filelock_change.rule = lock_type;
10301 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10302
10303 UserPerm perms(req->get_uid(), req->get_gid());
10304 return make_request(intr_req, perms, NULL, NULL, -1);
10305 }
10306
10307 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10308 {
10309 if (!in->fcntl_locks && !in->flock_locks)
10310 return;
10311
10312 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10313 encode(nr_fcntl_locks, bl);
10314 if (nr_fcntl_locks) {
10315 auto &lock_state = in->fcntl_locks;
10316 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10317 p != lock_state->held_locks.end();
10318 ++p)
10319 encode(p->second, bl);
10320 }
10321
10322 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10323 encode(nr_flock_locks, bl);
10324 if (nr_flock_locks) {
10325 auto &lock_state = in->flock_locks;
10326 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10327 p != lock_state->held_locks.end();
10328 ++p)
10329 encode(p->second, bl);
10330 }
10331
10332 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10333 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10334 }
10335
10336 void Client::_release_filelocks(Fh *fh)
10337 {
10338 if (!fh->fcntl_locks && !fh->flock_locks)
10339 return;
10340
10341 Inode *in = fh->inode.get();
10342 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10343
10344 list<pair<int, ceph_filelock> > to_release;
10345
10346 if (fh->fcntl_locks) {
10347 auto &lock_state = fh->fcntl_locks;
10348 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10349 p != lock_state->held_locks.end();
10350 ++p)
10351 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10352 lock_state.reset();
10353 }
10354 if (fh->flock_locks) {
10355 auto &lock_state = fh->flock_locks;
10356 for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10357 p != lock_state->held_locks.end();
10358 ++p)
10359 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10360 lock_state.reset();
10361 }
10362
10363 if (to_release.empty())
10364 return;
10365
10366 // mds has already released filelocks if session was closed.
10367 if (in->caps.empty())
10368 return;
10369
10370 struct flock fl;
10371 memset(&fl, 0, sizeof(fl));
10372 fl.l_whence = SEEK_SET;
10373 fl.l_type = F_UNLCK;
10374
10375 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10376 p != to_release.end();
10377 ++p) {
10378 fl.l_start = p->second.start;
10379 fl.l_len = p->second.length;
10380 fl.l_pid = p->second.pid;
10381 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10382 p->second.owner, true);
10383 }
10384 }
10385
10386 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10387 ceph_lock_state_t *lock_state)
10388 {
10389 int lock_cmd;
10390 if (F_RDLCK == fl->l_type)
10391 lock_cmd = CEPH_LOCK_SHARED;
10392 else if (F_WRLCK == fl->l_type)
10393 lock_cmd = CEPH_LOCK_EXCL;
10394 else
10395 lock_cmd = CEPH_LOCK_UNLOCK;
10396
10397 ceph_filelock filelock;
10398 filelock.start = fl->l_start;
10399 filelock.length = fl->l_len;
10400 filelock.client = 0;
10401 // see comment in _do_filelock()
10402 filelock.owner = owner | (1ULL << 63);
10403 filelock.pid = fl->l_pid;
10404 filelock.type = lock_cmd;
10405
10406 if (filelock.type == CEPH_LOCK_UNLOCK) {
10407 list<ceph_filelock> activated_locks;
10408 lock_state->remove_lock(filelock, activated_locks);
10409 } else {
10410 bool r = lock_state->add_lock(filelock, false, false, NULL);
10411 ceph_assert(r);
10412 }
10413 }
10414
10415 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10416 {
10417 Inode *in = fh->inode.get();
10418 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10419 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10420 return ret;
10421 }
10422
10423 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10424 {
10425 Inode *in = fh->inode.get();
10426 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10427 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10428 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10429 return ret;
10430 }
10431
10432 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10433 {
10434 Inode *in = fh->inode.get();
10435 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10436
10437 int sleep = !(cmd & LOCK_NB);
10438 cmd &= ~LOCK_NB;
10439
10440 int type;
10441 switch (cmd) {
10442 case LOCK_SH:
10443 type = F_RDLCK;
10444 break;
10445 case LOCK_EX:
10446 type = F_WRLCK;
10447 break;
10448 case LOCK_UN:
10449 type = F_UNLCK;
10450 break;
10451 default:
10452 return -EINVAL;
10453 }
10454
10455 struct flock fl;
10456 memset(&fl, 0, sizeof(fl));
10457 fl.l_type = type;
10458 fl.l_whence = SEEK_SET;
10459
10460 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10461 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10462 return ret;
10463 }
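
// Illustrative mapping for the flock() emulation above: a hypothetical
// call like _flock(fh, LOCK_EX | LOCK_NB, owner) becomes
//
//   sleep = 0;               // LOCK_NB -> don't block in the MDS
//   fl.l_type = F_WRLCK;     // LOCK_EX -> exclusive
//   fl.l_whence = SEEK_SET;  // start = len = 0 covers the whole file
//
// and is sent as a CEPH_LOCK_FLOCK CEPH_MDS_OP_SETFILELOCK request.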
10464
10465 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10466 {
10467 /* Since the only thing this does is wrap a call to statfs, and
10468 statfs takes a lock, it doesn't seem we have a need to split it
10469 out. */
10470 return statfs(0, stbuf, perms);
10471 }
10472
10473 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
10474 {
10475 if (!args)
10476 return;
10477 std::lock_guard l(client_lock);
10478 ldout(cct, 10) << __func__ << " cb " << args->handle
10479 << " invalidate_ino_cb " << args->ino_cb
10480 << " invalidate_dentry_cb " << args->dentry_cb
10481 << " switch_interrupt_cb " << args->switch_intr_cb
10482 << " remount_cb " << args->remount_cb
10483 << dendl;
10484 callback_handle = args->handle;
10485 if (args->ino_cb) {
10486 ino_invalidate_cb = args->ino_cb;
10487 async_ino_invalidator.start();
10488 }
10489 if (args->dentry_cb) {
10490 dentry_invalidate_cb = args->dentry_cb;
10491 async_dentry_invalidator.start();
10492 }
10493 if (args->switch_intr_cb) {
10494 switch_interrupt_cb = args->switch_intr_cb;
10495 interrupt_finisher.start();
10496 }
10497 if (args->remount_cb) {
10498 remount_cb = args->remount_cb;
10499 remount_finisher.start();
10500 }
10501 if (args->ino_release_cb) {
10502 ino_release_cb = args->ino_release_cb;
10503 async_ino_releasor.start();
10504 }
10505 if (args->umask_cb)
10506 umask_cb = args->umask_cb;
10507 }
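
// Explanatory note: each callback registered above gets a dedicated
// Finisher thread started alongside it, so invalidations, interrupts and
// remounts can run without holding client_lock. A hypothetical
// registration from a libcephfs-style consumer:
//
//   struct ceph_client_callback_args args = {};
//   args.handle = myctx;               // opaque pointer handed back to us
//   args.ino_cb = my_invalidate_ino;   // hypothetical user callback
//   client->ll_register_callbacks(&args);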
10508
10509 int Client::test_dentry_handling(bool can_invalidate)
10510 {
10511 int r = 0;
10512
10513 can_invalidate_dentries = can_invalidate;
10514
10515 if (can_invalidate_dentries) {
10516 ceph_assert(dentry_invalidate_cb);
10517 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10518 r = 0;
10519 } else {
10520 ceph_assert(remount_cb);
10521 ldout(cct, 1) << "using remount_cb" << dendl;
10522 r = _do_remount(false);
10523 }
10524
10525 return r;
10526 }
10527
10528 int Client::_sync_fs()
10529 {
10530 ldout(cct, 10) << __func__ << dendl;
10531
10532 // flush file data
10533 std::unique_ptr<C_SaferCond> cond = nullptr;
10534 if (cct->_conf->client_oc) {
10535 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10536 objectcacher->flush_all(cond.get());
10537 }
10538
10539 // flush caps
10540 flush_caps_sync();
10541 ceph_tid_t flush_tid = last_flush_tid;
10542
10543 // wait for unsafe mds requests
10544 wait_unsafe_requests();
10545
10546 wait_sync_caps(flush_tid);
10547
10548 if (nullptr != cond) {
10549 client_lock.unlock();
10550 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10551 cond->wait();
10552 ldout(cct, 15) << __func__ << " flush finished" << dendl;
10553 client_lock.lock();
10554 }
10555
10556 return 0;
10557 }
10558
10559 int Client::sync_fs()
10560 {
10561 std::lock_guard l(client_lock);
10562
10563 if (unmounting)
10564 return -ENOTCONN;
10565
10566 return _sync_fs();
10567 }
10568
10569 int64_t Client::drop_caches()
10570 {
10571 std::lock_guard l(client_lock);
10572 return objectcacher->release_all();
10573 }
10574
10575 int Client::_lazyio(Fh *fh, int enable)
10576 {
10577 Inode *in = fh->inode.get();
10578 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10579
10580 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10581 return 0;
10582
10583 int orig_mode = fh->mode;
10584 if (enable) {
10585 fh->mode |= CEPH_FILE_MODE_LAZY;
10586 in->get_open_ref(fh->mode);
10587 in->put_open_ref(orig_mode);
10588 check_caps(in, CHECK_CAPS_NODELAY);
10589 } else {
10590 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10591 in->get_open_ref(fh->mode);
10592 in->put_open_ref(orig_mode);
10593 check_caps(in, 0);
10594 }
10595
10596 return 0;
10597 }
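
// Explanatory note: toggling CEPH_FILE_MODE_LAZY swaps the inode's open
// ref from the old mode to the new one before check_caps(), so the MDS
// sees a consistent wanted-caps set. A hypothetical round trip:
//
//   _lazyio(fh, 1);  // relaxed coherency; use lazyio_propagate/synchronize
//   _lazyio(fh, 0);  // back to fully coherent caching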
10598
10599 int Client::lazyio(int fd, int enable)
10600 {
10601 std::lock_guard l(client_lock);
10602 Fh *f = get_filehandle(fd);
10603 if (!f)
10604 return -EBADF;
10605
10606 return _lazyio(f, enable);
10607 }
10608
10609 int Client::ll_lazyio(Fh *fh, int enable)
10610 {
10611 std::lock_guard lock(client_lock);
10612 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10613 tout(cct) << __func__ << std::endl;
10614
10615 return _lazyio(fh, enable);
10616 }
10617
10618 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10619 {
10620 std::lock_guard l(client_lock);
10621 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10622 << ", " << offset << ", " << count << ")" << dendl;
10623
10624 Fh *f = get_filehandle(fd);
10625 if (!f)
10626 return -EBADF;
10627
10628 // for now
10629 _fsync(f, true);
10630
10631 return 0;
10632 }
10633
10634 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10635 {
10636 std::lock_guard l(client_lock);
10637 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10638 << ", " << offset << ", " << count << ")" << dendl;
10639
10640 Fh *f = get_filehandle(fd);
10641 if (!f)
10642 return -EBADF;
10643 Inode *in = f->inode.get();
10644
10645 _fsync(f, true);
10646 if (_release(in)) {
10647 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10648 if (r < 0)
10649 return r;
10650 }
10651 return 0;
10652 }
10653
10654
10655 // =============================
10656 // snaps
10657
10658 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10659 {
10660 std::lock_guard l(client_lock);
10661
10662 if (unmounting)
10663 return -ENOTCONN;
10664
10665 filepath path(relpath);
10666 InodeRef in;
10667 int r = path_walk(path, &in, perm);
10668 if (r < 0)
10669 return r;
10670 if (cct->_conf->client_permissions) {
10671 r = may_create(in.get(), perm);
10672 if (r < 0)
10673 return r;
10674 }
10675 Inode *snapdir = open_snapdir(in.get());
10676 return _mkdir(snapdir, name, 0, perm);
10677 }
10678
10679 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10680 {
10681 std::lock_guard l(client_lock);
10682
10683 if (unmounting)
10684 return -ENOTCONN;
10685
10686 filepath path(relpath);
10687 InodeRef in;
10688 int r = path_walk(path, &in, perms);
10689 if (r < 0)
10690 return r;
10691 if (cct->_conf->client_permissions) {
10692 r = may_delete(in.get(), NULL, perms);
10693 if (r < 0)
10694 return r;
10695 }
10696 Inode *snapdir = open_snapdir(in.get());
10697 return _rmdir(snapdir, name, perms);
10698 }
10699
10700 // =============================
10701 // expose caps
10702
10703 int Client::get_caps_issued(int fd) {
10704
10705 std::lock_guard lock(client_lock);
10706
10707 if (unmounting)
10708 return -ENOTCONN;
10709
10710 Fh *f = get_filehandle(fd);
10711 if (!f)
10712 return -EBADF;
10713
10714 return f->inode->caps_issued();
10715 }
10716
10717 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10718 {
10719 std::lock_guard lock(client_lock);
10720
10721 if (unmounting)
10722 return -ENOTCONN;
10723
10724 filepath p(path);
10725 InodeRef in;
10726 int r = path_walk(p, &in, perms, true);
10727 if (r < 0)
10728 return r;
10729 return in->caps_issued();
10730 }
10731
10732 // =========================================
10733 // low level
10734
10735 Inode *Client::open_snapdir(Inode *diri)
10736 {
10737 Inode *in;
10738 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10739 if (!inode_map.count(vino)) {
10740 in = new Inode(this, vino, &diri->layout);
10741
10742 in->ino = diri->ino;
10743 in->snapid = CEPH_SNAPDIR;
10744 in->mode = diri->mode;
10745 in->uid = diri->uid;
10746 in->gid = diri->gid;
10747 in->nlink = 1;
10748 in->mtime = diri->mtime;
10749 in->ctime = diri->ctime;
10750 in->btime = diri->btime;
10751 in->size = diri->size;
10752 in->change_attr = diri->change_attr;
10753
10754 in->dirfragtree.clear();
10755 in->snapdir_parent = diri;
10756 diri->flags |= I_SNAPDIR_OPEN;
10757 inode_map[vino] = in;
10758 if (use_faked_inos())
10759 _assign_faked_ino(in);
10760 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10761 } else {
10762 in = inode_map[vino];
10763 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10764 }
10765 return in;
10766 }
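
// Explanatory sketch: the synthetic ".snap" directory shares the parent's
// ino but uses snapid CEPH_SNAPDIR, so it occupies its own vinodeno_t slot
// in inode_map, e.g. for a directory with ino 0x100:
//
//   vinodeno_t vino(0x100, CEPH_SNAPDIR);  // distinct from (0x100, NOSNAP)
//
// Its attributes are cloned from the live directory; no MDS round trip is
// needed to open it.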
10767
10768 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10769 Inode **out, const UserPerm& perms)
10770 {
10771 std::lock_guard lock(client_lock);
10772 vinodeno_t vparent = _get_vino(parent);
10773 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10774 tout(cct) << __func__ << std::endl;
10775 tout(cct) << name << std::endl;
10776
10777 if (unmounting)
10778 return -ENOTCONN;
10779
10780 int r = 0;
10781 if (!fuse_default_permissions) {
10782 if (strcmp(name, ".") && strcmp(name, "..")) {
10783 r = may_lookup(parent, perms);
10784 if (r < 0)
10785 return r;
10786 }
10787 }
10788
10789 string dname(name);
10790 InodeRef in;
10791
10792 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10793 if (r < 0) {
10794 attr->st_ino = 0;
10795 goto out;
10796 }
10797
10798 ceph_assert(in);
10799 fill_stat(in, attr);
10800 _ll_get(in.get());
10801
10802 out:
10803 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10804 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10805 tout(cct) << attr->st_ino << std::endl;
10806 *out = in.get();
10807 return r;
10808 }
10809
10810 int Client::ll_lookup_inode(
10811 struct inodeno_t ino,
10812 const UserPerm& perms,
10813 Inode **inode)
10814 {
10815 ceph_assert(inode != NULL);
10816 std::lock_guard lock(client_lock);
10817 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10818
10819 if (unmounting)
10820 return -ENOTCONN;
10821
10822 // Step 1: get the inode and a reference in *inode
10823 int r = _lookup_ino(ino, perms, inode);
10824 if (r)
10825 return r;
10826
10827 ceph_assert(*inode != NULL);
10828
10829 if (!(*inode)->dentries.empty()) {
10830 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10831 return 0;
10832 }
10833
10834 if ((*inode)->is_root()) {
10835 ldout(cct, 8) << "ino is root, no parent" << dendl;
10836 return 0;
10837 }
10838
10839 // Step 2: request the parent inode, so that we can look up the name
10840 Inode *parent;
10841 r = _lookup_parent(*inode, perms, &parent);
10842 if (r) {
10843 _ll_forget(*inode, 1);
10844 return r;
10845 }
10846
10847 ceph_assert(parent != NULL);
10848
10849 // Step 3: finally, get the name (dentry) of the requested inode
10850 r = _lookup_name(*inode, parent, perms);
10851 if (r) {
10852 // Unexpected error
10853 _ll_forget(parent, 1);
10854 _ll_forget(*inode, 1);
10855 return r;
10856 }
10857
10858 _ll_forget(parent, 1);
10859 return 0;
10860 }
10861
10862 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10863 struct ceph_statx *stx, unsigned want, unsigned flags,
10864 const UserPerm& perms)
10865 {
10866 std::lock_guard lock(client_lock);
10867 vinodeno_t vparent = _get_vino(parent);
10868 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10869 tout(cct) << "ll_lookupx" << std::endl;
10870 tout(cct) << name << std::endl;
10871
10872 if (unmounting)
10873 return -ENOTCONN;
10874
10875 int r = 0;
10876 if (!fuse_default_permissions) {
10877 r = may_lookup(parent, perms);
10878 if (r < 0)
10879 return r;
10880 }
10881
10882 string dname(name);
10883 InodeRef in;
10884
10885 unsigned mask = statx_to_mask(flags, want);
10886 r = _lookup(parent, dname, mask, &in, perms);
10887 if (r < 0) {
10888 stx->stx_ino = 0;
10889 stx->stx_mask = 0;
10890 } else {
10891 ceph_assert(in);
10892 fill_statx(in, mask, stx);
10893 _ll_get(in.get());
10894 }
10895
10896 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10897 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10898 tout(cct) << stx->stx_ino << std::endl;
10899 *out = in.get();
10900 return r;
10901 }
10902
10903 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10904 unsigned int want, unsigned int flags, const UserPerm& perms)
10905 {
10906 std::lock_guard lock(client_lock);
10907
10908 if (unmounting)
10909 return -ENOTCONN;
10910
10911 filepath fp(name, 0);
10912 InodeRef in;
10913 int rc;
10914 unsigned mask = statx_to_mask(flags, want);
10915
10916 ldout(cct, 3) << __func__ << " " << name << dendl;
10917 tout(cct) << __func__ << std::endl;
10918 tout(cct) << name << std::endl;
10919
10920 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10921 if (rc < 0) {
10922 /* zero out mask, just in case... */
10923 stx->stx_mask = 0;
10924 stx->stx_ino = 0;
10925 *out = NULL;
10926 return rc;
10927 } else {
10928 ceph_assert(in);
10929 fill_statx(in, mask, stx);
10930 _ll_get(in.get());
10931 *out = in.get();
10932 return 0;
10933 }
10934 }
10935
10936 void Client::_ll_get(Inode *in)
10937 {
10938 if (in->ll_ref == 0) {
10939 in->get();
10940 if (in->is_dir() && !in->dentries.empty()) {
10941 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10942 in->get_first_parent()->get(); // pin dentry
10943 }
10944 if (in->snapid != CEPH_NOSNAP)
10945 ll_snap_ref[in->snapid]++;
10946 }
10947 in->ll_get();
10948 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10949 }
10950
10951 int Client::_ll_put(Inode *in, uint64_t num)
10952 {
10953 in->ll_put(num);
10954 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10955 if (in->ll_ref == 0) {
10956 if (in->is_dir() && !in->dentries.empty()) {
10957 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
10958 in->get_first_parent()->put(); // unpin dentry
10959 }
10960 if (in->snapid != CEPH_NOSNAP) {
10961 auto p = ll_snap_ref.find(in->snapid);
10962 ceph_assert(p != ll_snap_ref.end());
10963 ceph_assert(p->second > 0);
10964 if (--p->second == 0)
10965 ll_snap_ref.erase(p);
10966 }
10967 put_inode(in);
10968 return 0;
10969 } else {
10970 return in->ll_ref;
10971 }
10972 }
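
// Hedged sketch of the low-level refcount pairing above: every Inode*
// handed out through the ll_* interface takes one _ll_get(), and forgets
// return them, possibly in bulk:
//
//   _ll_get(in);     // 0 -> 1 also pins the parent dentry and snap ref
//   ...
//   _ll_put(in, 1);  // back to 0: unpin dentry, drop snap ref, and
//                    // put_inode() may free the inode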
10973
10974 void Client::_ll_drop_pins()
10975 {
10976 ldout(cct, 10) << __func__ << dendl;
10977 std::set<InodeRef> to_be_put; // entries are destructed one by one as this set goes out of scope
10978 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10979 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10980 it != inode_map.end();
10981 it = next) {
10982 Inode *in = it->second;
10983 next = it;
10984 ++next;
10985 if (in->ll_ref){
10986 to_be_put.insert(in);
10987 _ll_put(in, in->ll_ref);
10988 }
10989 }
10990 }
10991
10992 bool Client::_ll_forget(Inode *in, uint64_t count)
10993 {
10994 inodeno_t ino = in->ino;
10995
10996 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10997 tout(cct) << __func__ << std::endl;
10998 tout(cct) << ino.val << std::endl;
10999 tout(cct) << count << std::endl;
11000
11001 // Ignore forget if we're no longer mounted
11002 if (unmounting)
11003 return true;
11004
11005 if (ino == 1) return true; // ignore forget on root.
11006
11007 bool last = false;
11008 if (in->ll_ref < count) {
11009 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11010 << ", which only has ll_ref=" << in->ll_ref << dendl;
11011 _ll_put(in, in->ll_ref);
11012 last = true;
11013 } else {
11014 if (_ll_put(in, count) == 0)
11015 last = true;
11016 }
11017
11018 return last;
11019 }
11020
11021 bool Client::ll_forget(Inode *in, uint64_t count)
11022 {
11023 std::lock_guard lock(client_lock);
11024 return _ll_forget(in, count);
11025 }
11026
11027 bool Client::ll_put(Inode *in)
11028 {
11029 /* ll_forget already takes the lock */
11030 return ll_forget(in, 1);
11031 }
11032
11033 int Client::ll_get_snap_ref(snapid_t snap)
11034 {
11035 std::lock_guard lock(client_lock);
11036 auto p = ll_snap_ref.find(snap);
11037 if (p != ll_snap_ref.end())
11038 return p->second;
11039 return 0;
11040 }
11041
11042 snapid_t Client::ll_get_snapid(Inode *in)
11043 {
11044 std::lock_guard lock(client_lock);
11045 return in->snapid;
11046 }
11047
11048 Inode *Client::ll_get_inode(ino_t ino)
11049 {
11050 std::lock_guard lock(client_lock);
11051
11052 if (unmounting)
11053 return NULL;
11054
11055 vinodeno_t vino = _map_faked_ino(ino);
11056 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11057 if (p == inode_map.end())
11058 return NULL;
11059 Inode *in = p->second;
11060 _ll_get(in);
11061 return in;
11062 }
11063
11064 Inode *Client::ll_get_inode(vinodeno_t vino)
11065 {
11066 std::lock_guard lock(client_lock);
11067
11068 if (unmounting)
11069 return NULL;
11070
11071 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11072 if (p == inode_map.end())
11073 return NULL;
11074 Inode *in = p->second;
11075 _ll_get(in);
11076 return in;
11077 }
11078
11079 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11080 {
11081 vinodeno_t vino = _get_vino(in);
11082
11083 ldout(cct, 8) << __func__ << " " << vino << dendl;
11084 tout(cct) << __func__ << std::endl;
11085 tout(cct) << vino.ino.val << std::endl;
11086
11087 if (vino.snapid < CEPH_NOSNAP)
11088 return 0;
11089 else
11090 return _getattr(in, caps, perms);
11091 }
11092
11093 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11094 {
11095 std::lock_guard lock(client_lock);
11096
11097 if (unmounting)
11098 return -ENOTCONN;
11099
11100 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11101
11102 if (res == 0)
11103 fill_stat(in, attr);
11104 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11105 return res;
11106 }
11107
11108 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11109 unsigned int flags, const UserPerm& perms)
11110 {
11111 std::lock_guard lock(client_lock);
11112
11113 if (unmounting)
11114 return -ENOTCONN;
11115
11116 int res = 0;
11117 unsigned mask = statx_to_mask(flags, want);
11118
11119 if (mask && !in->caps_issued_mask(mask, true))
11120 res = _ll_getattr(in, mask, perms);
11121
11122 if (res == 0)
11123 fill_statx(in, mask, stx);
11124 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11125 return res;
11126 }
11127
11128 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11129 const UserPerm& perms, InodeRef *inp)
11130 {
11131 vinodeno_t vino = _get_vino(in);
11132
11133 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11134 << dendl;
11135 tout(cct) << __func__ << std::endl;
11136 tout(cct) << vino.ino.val << std::endl;
11137 tout(cct) << stx->stx_mode << std::endl;
11138 tout(cct) << stx->stx_uid << std::endl;
11139 tout(cct) << stx->stx_gid << std::endl;
11140 tout(cct) << stx->stx_size << std::endl;
11141 tout(cct) << stx->stx_mtime << std::endl;
11142 tout(cct) << stx->stx_atime << std::endl;
11143 tout(cct) << stx->stx_btime << std::endl;
11144 tout(cct) << mask << std::endl;
11145
11146 if (!fuse_default_permissions) {
11147 int res = may_setattr(in, stx, mask, perms);
11148 if (res < 0)
11149 return res;
11150 }
11151
11152 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11153
11154 return __setattrx(in, stx, mask, perms, inp);
11155 }
11156
11157 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11158 const UserPerm& perms)
11159 {
11160 std::lock_guard lock(client_lock);
11161
11162 if (unmounting)
11163 return -ENOTCONN;
11164
11165 InodeRef target(in);
11166 int res = _ll_setattrx(in, stx, mask, perms, &target);
11167 if (res == 0) {
11168 ceph_assert(in == target.get());
11169 fill_statx(in, in->caps_issued(), stx);
11170 }
11171
11172 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11173 return res;
11174 }
11175
11176 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11177 const UserPerm& perms)
11178 {
11179 struct ceph_statx stx;
11180 stat_to_statx(attr, &stx);
11181
11182 std::lock_guard lock(client_lock);
11183
11184 if (unmounting)
11185 return -ENOTCONN;
11186
11187 InodeRef target(in);
11188 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11189 if (res == 0) {
11190 ceph_assert(in == target.get());
11191 fill_stat(in, attr);
11192 }
11193
11194 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11195 return res;
11196 }
11197
11198
11199 // ----------
11200 // xattrs
11201
11202 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11203 const UserPerm& perms)
11204 {
11205 std::lock_guard lock(client_lock);
11206
11207 if (unmounting)
11208 return -ENOTCONN;
11209
11210 InodeRef in;
11211 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11212 if (r < 0)
11213 return r;
11214 return _getxattr(in, name, value, size, perms);
11215 }
11216
11217 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11218 const UserPerm& perms)
11219 {
11220 std::lock_guard lock(client_lock);
11221
11222 if (unmounting)
11223 return -ENOTCONN;
11224
11225 InodeRef in;
11226 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11227 if (r < 0)
11228 return r;
11229 return _getxattr(in, name, value, size, perms);
11230 }
11231
11232 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11233 const UserPerm& perms)
11234 {
11235 std::lock_guard lock(client_lock);
11236
11237 if (unmounting)
11238 return -ENOTCONN;
11239
11240 Fh *f = get_filehandle(fd);
11241 if (!f)
11242 return -EBADF;
11243 return _getxattr(f->inode, name, value, size, perms);
11244 }
11245
11246 int Client::listxattr(const char *path, char *list, size_t size,
11247 const UserPerm& perms)
11248 {
11249 std::lock_guard lock(client_lock);
11250
11251 if (unmounting)
11252 return -ENOTCONN;
11253
11254 InodeRef in;
11255 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11256 if (r < 0)
11257 return r;
11258 return Client::_listxattr(in.get(), list, size, perms);
11259 }
11260
11261 int Client::llistxattr(const char *path, char *list, size_t size,
11262 const UserPerm& perms)
11263 {
11264 std::lock_guard lock(client_lock);
11265
11266 if (unmounting)
11267 return -ENOTCONN;
11268
11269 InodeRef in;
11270 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11271 if (r < 0)
11272 return r;
11273 return Client::_listxattr(in.get(), list, size, perms);
11274 }
11275
11276 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11277 {
11278 std::lock_guard lock(client_lock);
11279
11280 if (unmounting)
11281 return -ENOTCONN;
11282
11283 Fh *f = get_filehandle(fd);
11284 if (!f)
11285 return -EBADF;
11286 return Client::_listxattr(f->inode.get(), list, size, perms);
11287 }
11288
11289 int Client::removexattr(const char *path, const char *name,
11290 const UserPerm& perms)
11291 {
11292 std::lock_guard lock(client_lock);
11293
11294 if (unmounting)
11295 return -ENOTCONN;
11296
11297 InodeRef in;
11298 int r = Client::path_walk(path, &in, perms, true);
11299 if (r < 0)
11300 return r;
11301 return _removexattr(in, name, perms);
11302 }
11303
11304 int Client::lremovexattr(const char *path, const char *name,
11305 const UserPerm& perms)
11306 {
11307 std::lock_guard lock(client_lock);
11308
11309 if (unmounting)
11310 return -ENOTCONN;
11311
11312 InodeRef in;
11313 int r = Client::path_walk(path, &in, perms, false);
11314 if (r < 0)
11315 return r;
11316 return _removexattr(in, name, perms);
11317 }
11318
11319 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11320 {
11321 std::lock_guard lock(client_lock);
11322
11323 if (unmounting)
11324 return -ENOTCONN;
11325
11326 Fh *f = get_filehandle(fd);
11327 if (!f)
11328 return -EBADF;
11329 return _removexattr(f->inode, name, perms);
11330 }
11331
11332 int Client::setxattr(const char *path, const char *name, const void *value,
11333 size_t size, int flags, const UserPerm& perms)
11334 {
11335 _setxattr_maybe_wait_for_osdmap(name, value, size);
11336
11337 std::lock_guard lock(client_lock);
11338
11339 if (unmounting)
11340 return -ENOTCONN;
11341
11342 InodeRef in;
11343 int r = Client::path_walk(path, &in, perms, true);
11344 if (r < 0)
11345 return r;
11346 return _setxattr(in, name, value, size, flags, perms);
11347 }
11348
11349 int Client::lsetxattr(const char *path, const char *name, const void *value,
11350 size_t size, int flags, const UserPerm& perms)
11351 {
11352 _setxattr_maybe_wait_for_osdmap(name, value, size);
11353
11354 std::lock_guard lock(client_lock);
11355
11356 if (unmounting)
11357 return -ENOTCONN;
11358
11359 InodeRef in;
11360 int r = Client::path_walk(path, &in, perms, false);
11361 if (r < 0)
11362 return r;
11363 return _setxattr(in, name, value, size, flags, perms);
11364 }
11365
11366 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11367 int flags, const UserPerm& perms)
11368 {
11369 _setxattr_maybe_wait_for_osdmap(name, value, size);
11370
11371 std::lock_guard lock(client_lock);
11372
11373 if (unmounting)
11374 return -ENOTCONN;
11375
11376 Fh *f = get_filehandle(fd);
11377 if (!f)
11378 return -EBADF;
11379 return _setxattr(f->inode, name, value, size, flags, perms);
11380 }
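// setxattr honours the standard XATTR_CREATE / XATTR_REPLACE flags, which
// _do_setxattr() below maps onto CEPH_XATTR_CREATE / CEPH_XATTR_REPLACE in
// the MDS request. Sketch (illustrative; these are standard setxattr(2)
// semantics, with the actual checks performed on the MDS):
//
//   const char v[] = "hello";
//   // expected to fail with -EEXIST if "user.comment" already exists:
//   client->setxattr("/some/file", "user.comment", v, sizeof(v) - 1,
//                    XATTR_CREATE, perms);
//   // expected to fail with -ENODATA if it does not exist:
//   client->setxattr("/some/file", "user.comment", v, sizeof(v) - 1,
//                    XATTR_REPLACE, perms);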
11381
11382 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11383 const UserPerm& perms)
11384 {
11385 int r;
11386
11387 const VXattr *vxattr = _match_vxattr(in, name);
11388 if (vxattr) {
11390
11391 // Do a forced getattr to get the latest quota before returning
11392 // a value to userspace.
11393 int flags = 0;
11394 if (vxattr->flags & VXATTR_RSTAT) {
11395 flags |= CEPH_STAT_RSTAT;
11396 }
11397 r = _getattr(in, flags, perms, true);
11398 if (r != 0) {
11399 // Error from getattr!
11400 return r;
11401 }
11402
11403 // invoke the getxattr callback (a pointer-to-member), unless exists_cb says the vxattr is absent
11404 char buf[256];
11405 if (!vxattr->exists_cb || (this->*(vxattr->exists_cb))(in)) {
11406 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11407 } else {
11408 r = -ENODATA;
11409 }
11410
11411 if (size != 0) {
11412 if (r > (int)size || r >= (int)sizeof(buf)) {
11413 r = -ERANGE; // caller's buffer, or our local buf, is too small for the value
11414 } else if (r > 0) {
11415 memcpy(value, buf, r);
11416 }
11417 }
11418 goto out;
11419 }
11420
11421 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11422 r = -EOPNOTSUPP;
11423 goto out;
11424 }
11425
11426 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11427 if (r == 0) {
11428 string n(name);
11429 r = -ENODATA;
11430 if (in->xattrs.count(n)) {
11431 r = in->xattrs[n].length();
11432 if (r > 0 && size != 0) {
11433 if (size >= (unsigned)r)
11434 memcpy(value, in->xattrs[n].c_str(), r);
11435 else
11436 r = -ERANGE;
11437 }
11438 }
11439 }
11440 out:
11441 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11442 return r;
11443 }
11444
11445 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11446 const UserPerm& perms)
11447 {
11448 if (cct->_conf->client_permissions) {
11449 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11450 if (r < 0)
11451 return r;
11452 }
11453 return _getxattr(in.get(), name, value, size, perms);
11454 }
11455
11456 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11457 size_t size, const UserPerm& perms)
11458 {
11459 std::lock_guard lock(client_lock);
11460
11461 if (unmounting)
11462 return -ENOTCONN;
11463
11464 vinodeno_t vino = _get_vino(in);
11465
11466 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11467 tout(cct) << __func__ << std::endl;
11468 tout(cct) << vino.ino.val << std::endl;
11469 tout(cct) << name << std::endl;
11470
11471 if (!fuse_default_permissions) {
11472 int r = xattr_permission(in, name, MAY_READ, perms);
11473 if (r < 0)
11474 return r;
11475 }
11476
11477 return _getxattr(in, name, value, size, perms);
11478 }
11479
11480 int Client::_listxattr(Inode *in, char *name, size_t size,
11481 const UserPerm& perms)
11482 {
11483 bool len_only = (size == 0);
11484 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11485 if (r != 0) {
11486 goto out;
11487 }
11488
11489 r = 0;
11490 for (const auto& p : in->xattrs) {
11491 size_t this_len = p.first.length() + 1;
11492 r += this_len;
11493 if (len_only)
11494 continue;
11495
11496 if (this_len > size) {
11497 r = -ERANGE;
11498 goto out;
11499 }
11500
11501 memcpy(name, p.first.c_str(), this_len);
11502 name += this_len;
11503 size -= this_len;
11504 }
11505 out:
11506 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
11507 return r;
11508 }
11509
11510 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11511 const UserPerm& perms)
11512 {
11513 std::lock_guard lock(client_lock);
11514
11515 if (unmounting)
11516 return -ENOTCONN;
11517
11518 vinodeno_t vino = _get_vino(in);
11519
11520 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11521 tout(cct) << __func__ << std::endl;
11522 tout(cct) << vino.ino.val << std::endl;
11523 tout(cct) << size << std::endl;
11524
11525 return _listxattr(in, names, size, perms);
11526 }
11527
11528 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11529 size_t size, int flags, const UserPerm& perms)
11530 {
11531
11532 int xattr_flags = 0;
11533 if (!value)
11534 xattr_flags |= CEPH_XATTR_REMOVE;
11535 if (flags & XATTR_CREATE)
11536 xattr_flags |= CEPH_XATTR_CREATE;
11537 if (flags & XATTR_REPLACE)
11538 xattr_flags |= CEPH_XATTR_REPLACE;
11539
11540 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11541 filepath path;
11542 in->make_nosnap_relative_path(path);
11543 req->set_filepath(path);
11544 req->set_string2(name);
11545 req->set_inode(in);
11546 req->head.args.setxattr.flags = xattr_flags;
11547
11548 bufferlist bl;
11549 ceph_assert(value || size == 0);
11550 if (size > 0) bl.append((const char*)value, size);
11551 req->set_data(bl);
11552
11553 int res = make_request(req, perms);
11554
11555 trim_cache();
11556 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11557 res << dendl;
11558 return res;
11559 }
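// Note that a NULL value maps to CEPH_XATTR_REMOVE above. _setxattr()
// exploits this for the POSIX ACL xattrs: when an access ACL is exactly
// representable by the file mode, posix_acl_equiv_mode() returns 0, the
// value is collapsed to NULL, and the stored xattr is removed instead of
// rewritten.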
11560
11561 int Client::_setxattr(Inode *in, const char *name, const void *value,
11562 size_t size, int flags, const UserPerm& perms)
11563 {
11564 if (in->snapid != CEPH_NOSNAP) {
11565 return -EROFS;
11566 }
11567
11568 bool posix_acl_xattr = false;
11569 if (acl_type == POSIX_ACL)
11570 posix_acl_xattr = !strncmp(name, "system.", 7);
11571
11572 if (strncmp(name, "user.", 5) &&
11573 strncmp(name, "security.", 9) &&
11574 strncmp(name, "trusted.", 8) &&
11575 strncmp(name, "ceph.", 5) &&
11576 !posix_acl_xattr)
11577 return -EOPNOTSUPP;
11578
11579 bool check_realm = false;
11580
11581 if (posix_acl_xattr) {
11582 if (!strcmp(name, ACL_EA_ACCESS)) {
11583 mode_t new_mode = in->mode;
11584 if (value) {
11585 int ret = posix_acl_equiv_mode(value, size, &new_mode);
11586 if (ret < 0)
11587 return ret;
11588 if (ret == 0) {
11589 value = NULL;
11590 size = 0;
11591 }
11592 if (new_mode != in->mode) {
11593 struct ceph_statx stx;
11594 stx.stx_mode = new_mode;
11595 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
11596 if (ret < 0)
11597 return ret;
11598 }
11599 }
11600 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
11601 if (value) {
11602 if (!S_ISDIR(in->mode))
11603 return -EACCES;
11604 int ret = posix_acl_check(value, size);
11605 if (ret < 0)
11606 return -EINVAL;
11607 if (ret == 0) {
11608 value = NULL;
11609 size = 0;
11610 }
11611 }
11612 } else {
11613 return -EOPNOTSUPP;
11614 }
11615 } else {
11616 const VXattr *vxattr = _match_vxattr(in, name);
11617 if (vxattr) {
11618 if (vxattr->readonly)
11619 return -EOPNOTSUPP;
11620 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
11621 check_realm = true;
11622 }
11623 }
11624
11625 int ret = _do_setxattr(in, name, value, size, flags, perms);
11626 if (ret >= 0 && check_realm) {
11627 // check if snaprealm was created for quota inode
11628 if (in->quota.is_enable() &&
11629 !(in->snaprealm && in->snaprealm->ino == in->ino))
11630 ret = -EOPNOTSUPP;
11631 }
11632
11633 return ret;
11634 }
11635
11636 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11637 size_t size, int flags, const UserPerm& perms)
11638 {
11639 if (cct->_conf->client_permissions) {
11640 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11641 if (r < 0)
11642 return r;
11643 }
11644 return _setxattr(in.get(), name, value, size, flags, perms);
11645 }
11646
11647 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11648 {
11649 string tmp;
11650 if (name == "layout") {
11651 string::iterator begin = value.begin();
11652 string::iterator end = value.end();
11653 keys_and_values<string::iterator> p; // create instance of parser
11654 std::map<string, string> m; // map to receive results
11655 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11656 return -EINVAL;
11657 }
11658 if (begin != end)
11659 return -EINVAL;
11660 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11661 if (q->first == "pool") {
11662 tmp = q->second;
11663 break;
11664 }
11665 }
11666 } else if (name == "layout.pool") {
11667 tmp = value;
11668 }
11669
11670 if (tmp.length()) {
11671 int64_t pool;
11672 try {
11673 pool = boost::lexical_cast<unsigned>(tmp);
11674 if (!osdmap->have_pg_pool(pool))
11675 return -ENOENT;
11676 } catch (boost::bad_lexical_cast const&) {
11677 pool = osdmap->lookup_pg_pool_name(tmp);
11678 if (pool < 0) {
11679 return -ENOENT;
11680 }
11681 }
11682 }
11683
11684 return 0;
11685 }
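// The two name forms accepted above, with illustrative values:
//
//   "layout"      -> "stripe_unit=4194304 stripe_count=1 object_size=4194304
//                     pool=cephfs_data"  (key=value pairs; only "pool" is
//                                         validated against the osdmap here)
//   "layout.pool" -> "cephfs_data", or a numeric pool id such as "3"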
11686
11687 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11688 {
11689 // Setting a layout's data pool requires an osdmap epoch in the MetaRequest. There is
11690 // a race in which a newly created data pool is not yet known to the client or the MDS.
11691 // Fetch the latest osdmap so the MDS can quickly judge whether it needs a newer one.
11692 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11693 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11694 string rest(strstr(name, "layout"));
11695 string v((const char*)value, size);
11696 int r = objecter->with_osdmap([&](const OSDMap& o) {
11697 return _setxattr_check_data_pool(rest, v, &o);
11698 });
11699
11700 if (r == -ENOENT) {
11701 C_SaferCond ctx;
11702 objecter->wait_for_latest_osdmap(&ctx);
11703 ctx.wait();
11704 }
11705 }
11706 }
11707
11708 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11709 size_t size, int flags, const UserPerm& perms)
11710 {
11711 _setxattr_maybe_wait_for_osdmap(name, value, size);
11712
11713 std::lock_guard lock(client_lock);
11714
11715 if (unmounting)
11716 return -ENOTCONN;
11717
11718 vinodeno_t vino = _get_vino(in);
11719
11720 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11721 tout(cct) << __func__ << std::endl;
11722 tout(cct) << vino.ino.val << std::endl;
11723 tout(cct) << name << std::endl;
11724
11725 if (!fuse_default_permissions) {
11726 int r = xattr_permission(in, name, MAY_WRITE, perms);
11727 if (r < 0)
11728 return r;
11729 }
11730 return _setxattr(in, name, value, size, flags, perms);
11731 }
11732
11733 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11734 {
11735 if (in->snapid != CEPH_NOSNAP) {
11736 return -EROFS;
11737 }
11738
11739 // same xattr namespaces as are supported by the kernel client
11740 if (strncmp(name, "user.", 5) &&
11741 strncmp(name, "system.", 7) &&
11742 strncmp(name, "security.", 9) &&
11743 strncmp(name, "trusted.", 8) &&
11744 strncmp(name, "ceph.", 5))
11745 return -EOPNOTSUPP;
11746
11747 const VXattr *vxattr = _match_vxattr(in, name);
11748 if (vxattr && vxattr->readonly)
11749 return -EOPNOTSUPP;
11750
11751 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11752 filepath path;
11753 in->make_nosnap_relative_path(path);
11754 req->set_filepath(path);
11755 req->set_filepath2(name);
11756 req->set_inode(in);
11757
11758 int res = make_request(req, perms);
11759
11760 trim_cache();
11761 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11762 return res;
11763 }
11764
11765 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11766 {
11767 if (cct->_conf->client_permissions) {
11768 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11769 if (r < 0)
11770 return r;
11771 }
11772 return _removexattr(in.get(), name, perms);
11773 }
11774
11775 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11776 {
11777 std::lock_guard lock(client_lock);
11778
11779 if (unmounting)
11780 return -ENOTCONN;
11781
11782 vinodeno_t vino = _get_vino(in);
11783
11784 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11785 tout(cct) << "ll_removexattr" << std::endl;
11786 tout(cct) << vino.ino.val << std::endl;
11787 tout(cct) << name << std::endl;
11788
11789 if (!fuse_default_permissions) {
11790 int r = xattr_permission(in, name, MAY_WRITE, perms);
11791 if (r < 0)
11792 return r;
11793 }
11794
11795 return _removexattr(in, name, perms);
11796 }
11797
11798 bool Client::_vxattrcb_quota_exists(Inode *in)
11799 {
11800 return in->quota.is_enable() &&
11801 in->snaprealm && in->snaprealm->ino == in->ino;
11802 }
11803 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11804 {
11805 return snprintf(val, size,
11806 "max_bytes=%lld max_files=%lld",
11807 (long long int)in->quota.max_bytes,
11808 (long long int)in->quota.max_files);
11809 }
11810 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11811 {
11812 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11813 }
11814 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11815 {
11816 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11817 }
11818
11819 bool Client::_vxattrcb_layout_exists(Inode *in)
11820 {
11821 return in->layout != file_layout_t();
11822 }
11823 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11824 {
11825 int r = snprintf(val, size,
11826 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11827 (unsigned long long)in->layout.stripe_unit,
11828 (unsigned long long)in->layout.stripe_count,
11829 (unsigned long long)in->layout.object_size);
11830 objecter->with_osdmap([&](const OSDMap& o) {
11831 if (o.have_pg_pool(in->layout.pool_id))
11832 r += snprintf(val + r, size - r, "%s",
11833 o.get_pool_name(in->layout.pool_id).c_str());
11834 else
11835 r += snprintf(val + r, size - r, "%" PRIu64,
11836 (uint64_t)in->layout.pool_id);
11837 });
11838 if (in->layout.pool_ns.length())
11839 r += snprintf(val + r, size - r, " pool_namespace=%s",
11840 in->layout.pool_ns.c_str());
11841 return r;
11842 }
11843 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11844 {
11845 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11846 }
11847 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11848 {
11849 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11850 }
11851 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11852 {
11853 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11854 }
11855 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11856 {
11857 size_t r;
11858 objecter->with_osdmap([&](const OSDMap& o) {
11859 if (o.have_pg_pool(in->layout.pool_id))
11860 r = snprintf(val, size, "%s", o.get_pool_name(
11861 in->layout.pool_id).c_str());
11862 else
11863 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11864 });
11865 return r;
11866 }
11867 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11868 {
11869 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11870 }
11871 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11872 {
11873 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11874 }
11875 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11876 {
11877 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11878 }
11879 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11880 {
11881 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11882 }
11883 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11884 {
11885 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11886 }
11887 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11888 {
11889 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11890 }
11891 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11892 {
11893 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11894 }
11895 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11896 {
11897 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11898 }
11899 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11900 {
11901 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11902 (long)in->rstat.rctime.nsec());
11903 }
11904 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11905 {
11906 return in->dir_pin != -ENODATA;
11907 }
11908 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11909 {
11910 return snprintf(val, size, "%ld", (long)in->dir_pin);
11911 }
11912
11913 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11914 {
11915 return !in->snap_btime.is_zero();
11916 }
11917
11918 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11919 {
11920 return snprintf(val, size, "%llu.%09lu",
11921 (long long unsigned)in->snap_btime.sec(),
11922 (long unsigned)in->snap_btime.nsec());
11923 }
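// All of the _vxattrcb_* value callbacks above follow snprintf semantics:
// they write at most `size` bytes (NUL included) into `val` and return the
// length the full rendering would have, which lets _getxattr() report the
// required length (size == 0) or -ERANGE without a second callback call.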
11924
11925 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11926 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11927
11928 #define XATTR_NAME_CEPH(_type, _name) \
11929 { \
11930 name: CEPH_XATTR_NAME(_type, _name), \
11931 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11932 readonly: true, \
11933 exists_cb: NULL, \
11934 flags: 0, \
11935 }
11936 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11937 { \
11938 name: CEPH_XATTR_NAME(_type, _name), \
11939 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11940 readonly: true, \
11941 exists_cb: NULL, \
11942 flags: _flags, \
11943 }
11944 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11945 { \
11946 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11947 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11948 readonly: false, \
11949 exists_cb: &Client::_vxattrcb_layout_exists, \
11950 flags: 0, \
11951 }
11952 #define XATTR_QUOTA_FIELD(_type, _name) \
11953 { \
11954 name: CEPH_XATTR_NAME(_type, _name), \
11955 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11956 readonly: false, \
11957 exists_cb: &Client::_vxattrcb_quota_exists, \
11958 flags: 0, \
11959 }
11960
11961 const Client::VXattr Client::_dir_vxattrs[] = {
11962 {
11963 name: "ceph.dir.layout",
11964 getxattr_cb: &Client::_vxattrcb_layout,
11965 readonly: false,
11966 exists_cb: &Client::_vxattrcb_layout_exists,
11967 flags: 0,
11968 },
11969 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11970 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11971 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11972 XATTR_LAYOUT_FIELD(dir, layout, pool),
11973 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11974 XATTR_NAME_CEPH(dir, entries),
11975 XATTR_NAME_CEPH(dir, files),
11976 XATTR_NAME_CEPH(dir, subdirs),
11977 XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
11978 XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
11979 XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
11980 XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
11981 XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
11982 {
11983 name: "ceph.quota",
11984 getxattr_cb: &Client::_vxattrcb_quota,
11985 readonly: false,
11986 exists_cb: &Client::_vxattrcb_quota_exists,
11987 flags: 0,
11988 },
11989 XATTR_QUOTA_FIELD(quota, max_bytes),
11990 XATTR_QUOTA_FIELD(quota, max_files),
11991 {
11992 name: "ceph.dir.pin",
11993 getxattr_cb: &Client::_vxattrcb_dir_pin,
11994 readonly: false,
11995 exists_cb: &Client::_vxattrcb_dir_pin_exists,
11996 flags: 0,
11997 },
11998 {
11999 name: "ceph.snap.btime",
12000 getxattr_cb: &Client::_vxattrcb_snap_btime,
12001 readonly: true,
12002 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12003 flags: 0,
12004 },
12005 { name: "" } /* Required table terminator */
12006 };
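// Directory quotas are managed through the vxattrs in this table; from a
// mounted filesystem, e.g. (illustrative values and path):
//
//   setfattr -n ceph.quota.max_bytes -v 10000000000 /mnt/cephfs/somedir
//   setfattr -n ceph.quota.max_files -v 10000 /mnt/cephfs/somedir
//
// After a successful "ceph.quota*" write, _setxattr() additionally checks
// that a snaprealm exists for the quota inode (the check_realm path above).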
12007
12008 const Client::VXattr Client::_file_vxattrs[] = {
12009 {
12010 name: "ceph.file.layout",
12011 getxattr_cb: &Client::_vxattrcb_layout,
12012 readonly: false,
12013 exists_cb: &Client::_vxattrcb_layout_exists,
12014 flags: 0,
12015 },
12016 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
12017 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
12018 XATTR_LAYOUT_FIELD(file, layout, object_size),
12019 XATTR_LAYOUT_FIELD(file, layout, pool),
12020 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
12021 {
12022 name: "ceph.snap.btime",
12023 getxattr_cb: &Client::_vxattrcb_snap_btime,
12024 readonly: true,
12025 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12026 flags: 0,
12027 },
12028 { name: "" } /* Required table terminator */
12029 };
12030
12031 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12032 {
12033 if (in->is_dir())
12034 return _dir_vxattrs;
12035 else if (in->is_file())
12036 return _file_vxattrs;
12037 return NULL;
12038 }
12039
12040 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12041 {
12042 if (strncmp(name, "ceph.", 5) == 0) {
12043 const VXattr *vxattr = _get_vxattrs(in);
12044 if (vxattr) {
12045 while (!vxattr->name.empty()) {
12046 if (vxattr->name == name)
12047 return vxattr;
12048 vxattr++;
12049 }
12050 }
12051 }
12052 return NULL;
12053 }
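// Example: on a directory inode, _match_vxattr(in, "ceph.dir.rbytes") walks
// _dir_vxattrs and returns the XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT)
// entry. Because that entry carries VXATTR_RSTAT, _getxattr() first forces a
// getattr with CEPH_STAT_RSTAT so the recursive byte count is fresh. From a
// mounted filesystem the same value is visible via, e.g. (illustrative path):
//
//   getfattr -n ceph.dir.rbytes /mnt/cephfs/somedir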
12054
12055 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12056 {
12057 std::lock_guard lock(client_lock);
12058
12059 if (unmounting)
12060 return -ENOTCONN;
12061
12062 vinodeno_t vino = _get_vino(in);
12063
12064 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12065 tout(cct) << "ll_readlink" << std::endl;
12066 tout(cct) << vino.ino.val << std::endl;
12067
12068 for (auto dn : in->dentries) {
12069 touch_dn(dn);
12070 }
12071
12072 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12073 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12074 return r;
12075 }
12076
12077 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12078 const UserPerm& perms, InodeRef *inp)
12079 {
12080 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
12081 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12082 << ", gid " << perms.gid() << ")" << dendl;
12083
12084 if (strlen(name) > NAME_MAX)
12085 return -ENAMETOOLONG;
12086
12087 if (dir->snapid != CEPH_NOSNAP) {
12088 return -EROFS;
12089 }
12090 if (is_quota_files_exceeded(dir, perms)) {
12091 return -EDQUOT;
12092 }
12093
12094 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12095
12096 filepath path;
12097 dir->make_nosnap_relative_path(path);
12098 path.push_dentry(name);
12099 req->set_filepath(path);
12100 req->set_inode(dir);
12101 req->head.args.mknod.rdev = rdev;
12102 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12103 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12104
12105 bufferlist xattrs_bl;
12106 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12107 if (res < 0)
12108 goto fail;
12109 req->head.args.mknod.mode = mode;
12110 if (xattrs_bl.length() > 0)
12111 req->set_data(xattrs_bl);
12112
12113 Dentry *de;
12114 res = get_or_create(dir, name, &de);
12115 if (res < 0)
12116 goto fail;
12117 req->set_dentry(de);
12118
12119 res = make_request(req, perms, inp);
12120
12121 trim_cache();
12122
12123 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12124 return res;
12125
12126 fail:
12127 put_request(req);
12128 return res;
12129 }
12130
12131 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12132 dev_t rdev, struct stat *attr, Inode **out,
12133 const UserPerm& perms)
12134 {
12135 std::lock_guard lock(client_lock);
12136
12137 if (unmounting)
12138 return -ENOTCONN;
12139
12140 vinodeno_t vparent = _get_vino(parent);
12141
12142 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12143 tout(cct) << "ll_mknod" << std::endl;
12144 tout(cct) << vparent.ino.val << std::endl;
12145 tout(cct) << name << std::endl;
12146 tout(cct) << mode << std::endl;
12147 tout(cct) << rdev << std::endl;
12148
12149 if (!fuse_default_permissions) {
12150 int r = may_create(parent, perms);
12151 if (r < 0)
12152 return r;
12153 }
12154
12155 InodeRef in;
12156 int r = _mknod(parent, name, mode, rdev, perms, &in);
12157 if (r == 0) {
12158 fill_stat(in, attr);
12159 _ll_get(in.get());
12160 }
12161 tout(cct) << attr->st_ino << std::endl;
12162 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12163 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12164 *out = in.get();
12165 return r;
12166 }
12167
12168 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12169 dev_t rdev, Inode **out,
12170 struct ceph_statx *stx, unsigned want, unsigned flags,
12171 const UserPerm& perms)
12172 {
12173 unsigned caps = statx_to_mask(flags, want);
12174 std::lock_guard lock(client_lock);
12175
12176 if (unmounting)
12177 return -ENOTCONN;
12178
12179 vinodeno_t vparent = _get_vino(parent);
12180
12181 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12182 tout(cct) << "ll_mknodx" << std::endl;
12183 tout(cct) << vparent.ino.val << std::endl;
12184 tout(cct) << name << std::endl;
12185 tout(cct) << mode << std::endl;
12186 tout(cct) << rdev << std::endl;
12187
12188 if (!fuse_default_permissions) {
12189 int r = may_create(parent, perms);
12190 if (r < 0)
12191 return r;
12192 }
12193
12194 InodeRef in;
12195 int r = _mknod(parent, name, mode, rdev, perms, &in);
12196 if (r == 0) {
12197 fill_statx(in, caps, stx);
12198 _ll_get(in.get());
12199 }
12200 tout(cct) << stx->stx_ino << std::endl;
12201 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12202 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12203 *out = in.get();
12204 return r;
12205 }
12206
12207 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12208 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12209 int object_size, const char *data_pool, bool *created,
12210 const UserPerm& perms)
12211 {
12212 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12213 mode << dec << ")" << dendl;
12214
12215 if (strlen(name) > NAME_MAX)
12216 return -ENAMETOOLONG;
12217 if (dir->snapid != CEPH_NOSNAP) {
12218 return -EROFS;
12219 }
12220 if (is_quota_files_exceeded(dir, perms)) {
12221 return -EDQUOT;
12222 }
12223
12224 // use normalized flags to generate cmode
12225 int cflags = ceph_flags_sys2wire(flags);
12226 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12227 cflags |= CEPH_O_LAZY;
12228
12229 int cmode = ceph_flags_to_mode(cflags);
12230
12231 int64_t pool_id = -1;
12232 if (data_pool && *data_pool) {
12233 pool_id = objecter->with_osdmap(
12234 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12235 if (pool_id < 0)
12236 return -EINVAL;
12237 if (pool_id > 0xffffffffll)
12238 return -ERANGE; // bummer!
12239 }
12240
12241 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12242
12243 filepath path;
12244 dir->make_nosnap_relative_path(path);
12245 path.push_dentry(name);
12246 req->set_filepath(path);
12247 req->set_inode(dir);
12248 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12249
12250 req->head.args.open.stripe_unit = stripe_unit;
12251 req->head.args.open.stripe_count = stripe_count;
12252 req->head.args.open.object_size = object_size;
12253 if (cct->_conf->client_debug_getattr_caps)
12254 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12255 else
12256 req->head.args.open.mask = 0;
12257 req->head.args.open.pool = pool_id;
12258 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12259 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12260
12261 mode |= S_IFREG;
12262 bufferlist xattrs_bl;
12263 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12264 if (res < 0)
12265 goto fail;
12266 req->head.args.open.mode = mode;
12267 if (xattrs_bl.length() > 0)
12268 req->set_data(xattrs_bl);
12269
12270 Dentry *de;
12271 res = get_or_create(dir, name, &de);
12272 if (res < 0)
12273 goto fail;
12274 req->set_dentry(de);
12275
12276 res = make_request(req, perms, inp, created);
12277 if (res < 0) {
12278 goto reply_error;
12279 }
12280
12281 /* If the caller passed a value in fhp, do the open */
12282 if (fhp) {
12283 (*inp)->get_open_ref(cmode);
12284 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12285 }
12286
12287 reply_error:
12288 trim_cache();
12289
12290 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12291 << " layout " << stripe_unit
12292 << ' ' << stripe_count
12293 << ' ' << object_size
12294 <<") = " << res << dendl;
12295 return res;
12296
12297 fail:
12298 put_request(req);
12299 return res;
12300 }
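// The stripe_unit/stripe_count/object_size/data_pool arguments let a caller
// create a file with a non-default layout in a single round trip; a data
// pool name is resolved to a pool id against the client's osdmap before the
// request is built, and zero/NULL values leave the corresponding layout
// fields to the MDS defaults (an assumption about MDS-side handling, not
// shown here).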
12301
12302
12303 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12304 InodeRef *inp)
12305 {
12306 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12307 << mode << dec << ", uid " << perm.uid()
12308 << ", gid " << perm.gid() << ")" << dendl;
12309
12310 if (strlen(name) > NAME_MAX)
12311 return -ENAMETOOLONG;
12312
12313 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12314 return -EROFS;
12315 }
12316 if (is_quota_files_exceeded(dir, perm)) {
12317 return -EDQUOT;
12318 }
12319 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
12320 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
12321
12322 filepath path;
12323 dir->make_nosnap_relative_path(path);
12324 path.push_dentry(name);
12325 req->set_filepath(path);
12326 req->set_inode(dir);
12327 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12328 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12329
12330 mode |= S_IFDIR;
12331 bufferlist xattrs_bl;
12332 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
12333 if (res < 0)
12334 goto fail;
12335 req->head.args.mkdir.mode = mode;
12336 if (xattrs_bl.length() > 0)
12337 req->set_data(xattrs_bl);
12338
12339 Dentry *de;
12340 res = get_or_create(dir, name, &de);
12341 if (res < 0)
12342 goto fail;
12343 req->set_dentry(de);
12344
12345 ldout(cct, 10) << "_mkdir: making request" << dendl;
12346 res = make_request(req, perm, inp);
12347 ldout(cct, 10) << "_mkdir result is " << res << dendl;
12348
12349 trim_cache();
12350
12351 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12352 return res;
12353
12354 fail:
12355 put_request(req);
12356 return res;
12357 }
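// _mkdir() doubles as snapshot creation: when `dir` is the snapdir
// (snapid == CEPH_SNAPDIR) the request becomes CEPH_MDS_OP_MKSNAP, which is
// what a mkdir inside the (by default) ".snap" directory of a mounted
// filesystem turns into, e.g. (illustrative path):
//
//   mkdir /mnt/cephfs/somedir/.snap/mysnap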
12358
12359 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12360 struct stat *attr, Inode **out, const UserPerm& perm)
12361 {
12362 std::lock_guard lock(client_lock);
12363
12364 if (unmounting)
12365 return -ENOTCONN;
12366
12367 vinodeno_t vparent = _get_vino(parent);
12368
12369 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12370 tout(cct) << "ll_mkdir" << std::endl;
12371 tout(cct) << vparent.ino.val << std::endl;
12372 tout(cct) << name << std::endl;
12373 tout(cct) << mode << std::endl;
12374
12375 if (!fuse_default_permissions) {
12376 int r = may_create(parent, perm);
12377 if (r < 0)
12378 return r;
12379 }
12380
12381 InodeRef in;
12382 int r = _mkdir(parent, name, mode, perm, &in);
12383 if (r == 0) {
12384 fill_stat(in, attr);
12385 _ll_get(in.get());
12386 }
12387 tout(cct) << attr->st_ino << std::endl;
12388 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12389 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12390 *out = in.get();
12391 return r;
12392 }
12393
12394 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12395 struct ceph_statx *stx, unsigned want, unsigned flags,
12396 const UserPerm& perms)
12397 {
12398 std::lock_guard lock(client_lock);
12399
12400 if (unmounting)
12401 return -ENOTCONN;
12402
12403 vinodeno_t vparent = _get_vino(parent);
12404
12405 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12406 tout(cct) << "ll_mkdirx" << std::endl;
12407 tout(cct) << vparent.ino.val << std::endl;
12408 tout(cct) << name << std::endl;
12409 tout(cct) << mode << std::endl;
12410
12411 if (!fuse_default_permissions) {
12412 int r = may_create(parent, perms);
12413 if (r < 0)
12414 return r;
12415 }
12416
12417 InodeRef in;
12418 int r = _mkdir(parent, name, mode, perms, &in);
12419 if (r == 0) {
12420 fill_statx(in, statx_to_mask(flags, want), stx);
12421 _ll_get(in.get());
12422 } else {
12423 stx->stx_ino = 0;
12424 stx->stx_mask = 0;
12425 }
12426 tout(cct) << stx->stx_ino << std::endl;
12427 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12428 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12429 *out = in.get();
12430 return r;
12431 }
12432
12433 int Client::_symlink(Inode *dir, const char *name, const char *target,
12434 const UserPerm& perms, InodeRef *inp)
12435 {
12436 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
12437 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
12438 << dendl;
12439
12440 if (strlen(name) > NAME_MAX)
12441 return -ENAMETOOLONG;
12442
12443 if (dir->snapid != CEPH_NOSNAP) {
12444 return -EROFS;
12445 }
12446 if (is_quota_files_exceeded(dir, perms)) {
12447 return -EDQUOT;
12448 }
12449
12450 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
12451
12452 filepath path;
12453 dir->make_nosnap_relative_path(path);
12454 path.push_dentry(name);
12455 req->set_filepath(path);
12456 req->set_inode(dir);
12457 req->set_string2(target);
12458 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12459 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12460
12461 Dentry *de;
12462 int res = get_or_create(dir, name, &de);
12463 if (res < 0)
12464 goto fail;
12465 req->set_dentry(de);
12466
12467 res = make_request(req, perms, inp);
12468
12469 trim_cache();
12470 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
12471 res << dendl;
12472 return res;
12473
12474 fail:
12475 put_request(req);
12476 return res;
12477 }
12478
12479 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12480 struct stat *attr, Inode **out, const UserPerm& perms)
12481 {
12482 std::lock_guard lock(client_lock);
12483
12484 if (unmounting)
12485 return -ENOTCONN;
12486
12487 vinodeno_t vparent = _get_vino(parent);
12488
12489 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12490 << dendl;
12491 tout(cct) << "ll_symlink" << std::endl;
12492 tout(cct) << vparent.ino.val << std::endl;
12493 tout(cct) << name << std::endl;
12494 tout(cct) << value << std::endl;
12495
12496 if (!fuse_default_permissions) {
12497 int r = may_create(parent, perms);
12498 if (r < 0)
12499 return r;
12500 }
12501
12502 InodeRef in;
12503 int r = _symlink(parent, name, value, perms, &in);
12504 if (r == 0) {
12505 fill_stat(in, attr);
12506 _ll_get(in.get());
12507 }
12508 tout(cct) << attr->st_ino << std::endl;
12509 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12510 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12511 *out = in.get();
12512 return r;
12513 }
12514
12515 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12516 Inode **out, struct ceph_statx *stx, unsigned want,
12517 unsigned flags, const UserPerm& perms)
12518 {
12519 std::lock_guard lock(client_lock);
12520
12521 if (unmounting)
12522 return -ENOTCONN;
12523
12524 vinodeno_t vparent = _get_vino(parent);
12525
12526 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12527 << dendl;
12528 tout(cct) << "ll_symlinkx" << std::endl;
12529 tout(cct) << vparent.ino.val << std::endl;
12530 tout(cct) << name << std::endl;
12531 tout(cct) << value << std::endl;
12532
12533 if (!fuse_default_permissions) {
12534 int r = may_create(parent, perms);
12535 if (r < 0)
12536 return r;
12537 }
12538
12539 InodeRef in;
12540 int r = _symlink(parent, name, value, perms, &in);
12541 if (r == 0) {
12542 fill_statx(in, statx_to_mask(flags, want), stx);
12543 _ll_get(in.get());
12544 }
12545 tout(cct) << stx->stx_ino << std::endl;
12546 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12547 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12548 *out = in.get();
12549 return r;
12550 }
12551
12552 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
12553 {
12554 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
12555 << " uid " << perm.uid() << " gid " << perm.gid()
12556 << ")" << dendl;
12557
12558 if (dir->snapid != CEPH_NOSNAP) {
12559 return -EROFS;
12560 }
12561
12562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
12563
12564 filepath path;
12565 dir->make_nosnap_relative_path(path);
12566 path.push_dentry(name);
12567 req->set_filepath(path);
12568
12569 InodeRef otherin;
12570 Inode *in;
12571 Dentry *de;
12572
12573 int res = get_or_create(dir, name, &de);
12574 if (res < 0)
12575 goto fail;
12576 req->set_dentry(de);
12577 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12578 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12579
12580 res = _lookup(dir, name, 0, &otherin, perm);
12581 if (res < 0)
12582 goto fail;
12583
12584 in = otherin.get();
12585 req->set_other_inode(in);
12586 in->break_all_delegs();
12587 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12588
12589 req->set_inode(dir);
12590
12591 res = make_request(req, perm);
12592
12593 trim_cache();
12594 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
12595 return res;
12596
12597 fail:
12598 put_request(req);
12599 return res;
12600 }
12601
12602 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12603 {
12604 std::lock_guard lock(client_lock);
12605
12606 if (unmounting)
12607 return -ENOTCONN;
12608
12609 vinodeno_t vino = _get_vino(in);
12610
12611 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12612 tout(cct) << "ll_unlink" << std::endl;
12613 tout(cct) << vino.ino.val << std::endl;
12614 tout(cct) << name << std::endl;
12615
12616 if (!fuse_default_permissions) {
12617 int r = may_delete(in, name, perm);
12618 if (r < 0)
12619 return r;
12620 }
12621 return _unlink(in, name, perm);
12622 }
12623
12624 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12625 {
12626 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
12627 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12628
12629 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12630 return -EROFS;
12631 }
12632
12633 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12634 MetaRequest *req = new MetaRequest(op);
12635 filepath path;
12636 dir->make_nosnap_relative_path(path);
12637 path.push_dentry(name);
12638 req->set_filepath(path);
12639 req->set_inode(dir);
12640
12641 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12642 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12643 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12644
12645 InodeRef in;
12646
12647 Dentry *de;
12648 int res = get_or_create(dir, name, &de);
12649 if (res < 0)
12650 goto fail;
12651 if (op == CEPH_MDS_OP_RMDIR)
12652 req->set_dentry(de);
12653 else
12654 de->get();
12655
12656 res = _lookup(dir, name, 0, &in, perms);
12657 if (res < 0)
12658 goto fail;
12659
12660 if (op == CEPH_MDS_OP_RMSNAP) {
12661 unlink(de, true, true);
12662 de->put();
12663 }
12664 req->set_other_inode(in.get());
12665
12666 res = make_request(req, perms);
12667
12668 trim_cache();
12669 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
12670 return res;
12671
12672 fail:
12673 put_request(req);
12674 return res;
12675 }
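// The inverse: when `dir` is the snapdir, _rmdir() issues CEPH_MDS_OP_RMSNAP
// and unlinks the dentry locally itself (see the unlink() call above),
// since, as with renamesnap, the reply carries no trace to do it for us.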
12676
12677 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12678 {
12679 std::lock_guard lock(client_lock);
12680
12681 if (unmounting)
12682 return -ENOTCONN;
12683
12684 vinodeno_t vino = _get_vino(in);
12685
12686 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12687 tout(cct) << "ll_rmdir" << std::endl;
12688 tout(cct) << vino.ino.val << std::endl;
12689 tout(cct) << name << std::endl;
12690
12691 if (!fuse_default_permissions) {
12692 int r = may_delete(in, name, perms);
12693 if (r < 0)
12694 return r;
12695 }
12696
12697 return _rmdir(in, name, perms);
12698 }
12699
12700 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12701 {
12702 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
12703 << todir->ino << " " << toname
12704 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12705 << dendl;
12706
12707 if (fromdir->snapid != todir->snapid)
12708 return -EXDEV;
12709
12710 int op = CEPH_MDS_OP_RENAME;
12711 if (fromdir->snapid != CEPH_NOSNAP) {
12712 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12713 op = CEPH_MDS_OP_RENAMESNAP;
12714 else
12715 return -EROFS;
12716 }
12717
12718 InodeRef target;
12719 MetaRequest *req = new MetaRequest(op);
12720
12721 filepath from;
12722 fromdir->make_nosnap_relative_path(from);
12723 from.push_dentry(fromname);
12724 filepath to;
12725 todir->make_nosnap_relative_path(to);
12726 to.push_dentry(toname);
12727 req->set_filepath(to);
12728 req->set_filepath2(from);
12729
12730 Dentry *oldde;
12731 int res = get_or_create(fromdir, fromname, &oldde);
12732 if (res < 0)
12733 goto fail;
12734 Dentry *de;
12735 res = get_or_create(todir, toname, &de);
12736 if (res < 0)
12737 goto fail;
12738
12739 if (op == CEPH_MDS_OP_RENAME) {
12740 req->set_old_dentry(oldde);
12741 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12742 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12743
12744 req->set_dentry(de);
12745 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12746 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12747
12748 InodeRef oldin, otherin;
12749 Inode *fromdir_root = nullptr;
12750 Inode *todir_root = nullptr;
12751 int mask = 0;
12752 bool quota_check = false;
12753 if (fromdir != todir) {
12754 fromdir_root =
12755 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12756 todir_root =
12757 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12758
12759 if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
12760 // use the CEPH_STAT_RSTAT mask to force a getattr or lookup request to
12761 // the auth MDS, so we get the latest rstat for todir_root and the source
12762 // dir even if their dentry caches and inode caps are satisfied.
12763 res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
12764 if (res < 0)
12765 goto fail;
12766
12767 quota_check = true;
12768 if (oldde->inode && oldde->inode->is_dir()) {
12769 mask |= CEPH_STAT_RSTAT;
12770 }
12771 }
12772 }
12773
12774 res = _lookup(fromdir, fromname, mask, &oldin, perm);
12775 if (res < 0)
12776 goto fail;
12777
12778 Inode *oldinode = oldin.get();
12779 oldinode->break_all_delegs();
12780 req->set_old_inode(oldinode);
12781 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12782
12783 if (quota_check) {
12784 int64_t old_bytes, old_files;
12785 if (oldinode->is_dir()) {
12786 old_bytes = oldinode->rstat.rbytes;
12787 old_files = oldinode->rstat.rsize();
12788 } else {
12789 old_bytes = oldinode->size;
12790 old_files = 1;
12791 }
12792
12793 bool quota_exceed = false;
12794 if (todir_root && todir_root->quota.max_bytes &&
12795 (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
12796 ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
12797 << old_bytes << ") to (" << todir->ino
12798 << ") will exceed quota on " << *todir_root << dendl;
12799 quota_exceed = true;
12800 }
12801
12802 if (todir_root && todir_root->quota.max_files &&
12803 (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
12804 ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
12805 << old_files << ") to (" << todir->ino
12806 << ") will exceed quota on " << *todir_root << dendl;
12807 quota_exceed = true;
12808 }
12809
12810 if (quota_exceed) {
12811 res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
12812 goto fail;
12813 }
12814 }
12815
12816 res = _lookup(todir, toname, 0, &otherin, perm);
12817 switch (res) {
12818 case 0:
12819 {
12820 Inode *in = otherin.get();
12821 req->set_other_inode(in);
12822 in->break_all_delegs();
12823 }
12824 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12825 break;
12826 case -ENOENT:
12827 break;
12828 default:
12829 goto fail;
12830 }
12831
12832 req->set_inode(todir);
12833 } else {
12834 // a renamesnap reply contains no tracedn, so we need to invalidate the
12835 // dentries manually
12836 unlink(oldde, true, true);
12837 unlink(de, true, true);
12838
12839 req->set_inode(todir);
12840 }
12841
12842 res = make_request(req, perm, &target);
12843 ldout(cct, 10) << "rename result is " << res << dendl;
12844
12845 // the reply trace, if any, has already updated the renamed item in our cache
12846
12847 trim_cache();
12848 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12849 return res;
12850
12851 fail:
12852 put_request(req);
12853 return res;
12854 }
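// Note the asymmetry above when a rename would exceed the target realm's
// quota: a regular file gets -EDQUOT, while a directory gets -EXDEV, which
// prompts tools such as mv(1) to fall back to a recursive copy + unlink,
// so quota enforcement applies incrementally as the data is copied.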
12855
12856 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12857 const char *newname, const UserPerm& perm)
12858 {
12859 std::lock_guard lock(client_lock);
12860
12861 if (unmounting)
12862 return -ENOTCONN;
12863
12864 vinodeno_t vparent = _get_vino(parent);
12865 vinodeno_t vnewparent = _get_vino(newparent);
12866
12867 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12868 << vnewparent << " " << newname << dendl;
12869 tout(cct) << "ll_rename" << std::endl;
12870 tout(cct) << vparent.ino.val << std::endl;
12871 tout(cct) << name << std::endl;
12872 tout(cct) << vnewparent.ino.val << std::endl;
12873 tout(cct) << newname << std::endl;
12874
12875 if (!fuse_default_permissions) {
12876 int r = may_delete(parent, name, perm);
12877 if (r < 0)
12878 return r;
12879 r = may_delete(newparent, newname, perm);
12880 if (r < 0 && r != -ENOENT)
12881 return r;
12882 }
12883
12884 return _rename(parent, name, newparent, newname, perm);
12885 }
12886
12887 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12888 {
12889 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12890 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12891
12892 if (strlen(newname) > NAME_MAX)
12893 return -ENAMETOOLONG;
12894
12895 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12896 return -EROFS;
12897 }
12898 if (is_quota_files_exceeded(dir, perm)) {
12899 return -EDQUOT;
12900 }
12901
12902 in->break_all_delegs();
12903 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12904
12905 filepath path(newname, dir->ino);
12906 req->set_filepath(path);
12907 filepath existing(in->ino);
12908 req->set_filepath2(existing);
12909
12910 req->set_inode(dir);
12911 req->inode_drop = CEPH_CAP_FILE_SHARED;
12912 req->inode_unless = CEPH_CAP_FILE_EXCL;
12913
12914 Dentry *de;
12915 int res = get_or_create(dir, newname, &de);
12916 if (res < 0)
12917 goto fail;
12918 req->set_dentry(de);
12919
12920 res = make_request(req, perm, inp);
12921 ldout(cct, 10) << "link result is " << res << dendl;
12922
12923 trim_cache();
12924 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
12925 return res;
12926
12927 fail:
12928 put_request(req);
12929 return res;
12930 }
12931
12932 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12933 const UserPerm& perm)
12934 {
12935 std::lock_guard lock(client_lock);
12936
12937 if (unmounting)
12938 return -ENOTCONN;
12939
12940 vinodeno_t vino = _get_vino(in);
12941 vinodeno_t vnewparent = _get_vino(newparent);
12942
12943 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12944 newname << dendl;
12945 tout(cct) << "ll_link" << std::endl;
12946 tout(cct) << vino.ino.val << std::endl;
12947 tout(cct) << vnewparent.ino.val << std::endl;
12948 tout(cct) << newname << std::endl;
12949
12950 InodeRef target;
12951
12952 if (!fuse_default_permissions) {
12953 if (S_ISDIR(in->mode))
12954 return -EPERM;
12955
12956 int r = may_hardlink(in, perm);
12957 if (r < 0)
12958 return r;
12959
12960 r = may_create(newparent, perm);
12961 if (r < 0)
12962 return r;
12963 }
12964
12965 return _link(in, newparent, newname, perm, &target);
12966 }
12967
12968 int Client::ll_num_osds(void)
12969 {
12970 std::lock_guard lock(client_lock);
12971 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12972 }
12973
12974 int Client::ll_osdaddr(int osd, uint32_t *addr)
12975 {
12976 std::lock_guard lock(client_lock);
12977
12978 entity_addr_t g;
12979 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12980 if (!o.exists(osd))
12981 return false;
12982 g = o.get_addrs(osd).front();
12983 return true;
12984 });
12985 if (!exists)
12986 return -1;
12987 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12988 *addr = ntohl(nb_addr);
12989 return 0;
12990 }
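// Note: ll_osdaddr() only reports a (host-byte-order) IPv4 address, since it
// reads sin_addr from the OSD's first listed address.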
12991
12992 uint32_t Client::ll_stripe_unit(Inode *in)
12993 {
12994 std::lock_guard lock(client_lock);
12995 return in->layout.stripe_unit;
12996 }
12997
12998 uint64_t Client::ll_snap_seq(Inode *in)
12999 {
13000 std::lock_guard lock(client_lock);
13001 return in->snaprealm->seq;
13002 }
13003
13004 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13005 {
13006 std::lock_guard lock(client_lock);
13007 *layout = in->layout;
13008 return 0;
13009 }
13010
13011 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13012 {
13013 return ll_file_layout(fh->inode.get(), layout);
13014 }
13015
13016 /* Currently we cannot take advantage of redundancy in reads, since we
13017 would have to go through all possible placement groups (a
13018 potentially quite large number determined by a hash), and use CRUSH
13019 to calculate the appropriate set of OSDs for each placement group,
13020 then index into that. An array with one entry per OSD is much more
13021 tractable and works for demonstration purposes. */
13022
13023 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13024 file_layout_t* layout)
13025 {
13026 std::lock_guard lock(client_lock);
13027
13028 inodeno_t ino = in->ino;
13029 uint32_t object_size = layout->object_size;
13030 uint32_t su = layout->stripe_unit;
13031 uint32_t stripe_count = layout->stripe_count;
13032 uint64_t stripes_per_object = object_size / su;
13033 uint64_t stripeno = 0, stripepos = 0;
13034
13035 if (stripe_count) {
13036 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13037 stripepos = blockno % stripe_count; // which object in the object set (X)
13038 }
13039 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13040 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13041
13042 object_t oid = file_object_t(ino, objectno);
13043 return objecter->with_osdmap([&](const OSDMap& o) {
13044 ceph_object_layout olayout =
13045 o.file_to_object_layout(oid, *layout);
13046 pg_t pg = (pg_t)olayout.ol_pgid;
13047 vector<int> osds;
13048 int primary;
13049 o.pg_to_acting_osds(pg, &osds, &primary);
13050 return primary;
13051 });
13052 }
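// Worked example of the striping math above (illustrative values): with
// object_size = 4 MiB, stripe_unit = 1 MiB, stripe_count = 3 and
// blockno = 7, we get stripeno = 7 / 3 = 2, stripepos = 7 % 3 = 1 and
// stripes_per_object = 4, so objectsetno = 2 / 4 = 0 and
// objectno = 0 * 3 + 1 = 1: the block lives in object 1 of object set 0.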
13053
13054 /* Return the offset of the block, internal to the object */
13055
13056 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13057 {
13058 std::lock_guard lock(client_lock);
13059 file_layout_t *layout=&(in->layout);
13060 uint32_t object_size = layout->object_size;
13061 uint32_t su = layout->stripe_unit;
13062 uint64_t stripes_per_object = object_size / su;
13063
13064 return (blockno % stripes_per_object) * su;
13065 }
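// Continuing the example above: blockno = 7 with stripes_per_object = 4 and
// stripe_unit = 1 MiB lands (7 % 4) * 1 MiB = 3 MiB into that object.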
13066
13067 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13068 const UserPerm& perms)
13069 {
13070 std::lock_guard lock(client_lock);
13071
13072 if (unmounting)
13073 return -ENOTCONN;
13074
13075 vinodeno_t vino = _get_vino(in);
13076
13077 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13078 tout(cct) << "ll_opendir" << std::endl;
13079 tout(cct) << vino.ino.val << std::endl;
13080
13081 if (!fuse_default_permissions) {
13082 int r = may_open(in, flags, perms);
13083 if (r < 0)
13084 return r;
13085 }
13086
13087 int r = _opendir(in, dirpp, perms);
13088 tout(cct) << (unsigned long)*dirpp << std::endl;
13089
13090 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13091 << dendl;
13092 return r;
13093 }
13094
13095 int Client::ll_releasedir(dir_result_t *dirp)
13096 {
13097 std::lock_guard lock(client_lock);
13098 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13099 tout(cct) << "ll_releasedir" << std::endl;
13100 tout(cct) << (unsigned long)dirp << std::endl;
13101
13102 if (unmounting)
13103 return -ENOTCONN;
13104
13105 _closedir(dirp);
13106 return 0;
13107 }
13108
13109 int Client::ll_fsyncdir(dir_result_t *dirp)
13110 {
13111 std::lock_guard lock(client_lock);
13112 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13113 tout(cct) << "ll_fsyncdir" << std::endl;
13114 tout(cct) << (unsigned long)dirp << std::endl;
13115
13116 if (unmounting)
13117 return -ENOTCONN;
13118
13119 return _fsync(dirp->inode.get(), false);
13120 }
13121
13122 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13123 {
13124 ceph_assert(!(flags & O_CREAT));
13125
13126 std::lock_guard lock(client_lock);
13127
13128 if (unmounting)
13129 return -ENOTCONN;
13130
13131 vinodeno_t vino = _get_vino(in);
13132
13133 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13134 tout(cct) << "ll_open" << std::endl;
13135 tout(cct) << vino.ino.val << std::endl;
13136 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13137
13138 int r;
13139 if (!fuse_default_permissions) {
13140 r = may_open(in, flags, perms);
13141 if (r < 0)
13142 goto out;
13143 }
13144
13145 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13146
13147 out:
13148 Fh *fhptr = fhp ? *fhp : NULL;
13149 if (fhptr) {
13150 ll_unclosed_fh_set.insert(fhptr);
13151 }
13152 tout(cct) << (unsigned long)fhptr << std::endl;
13153 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13154 " = " << r << " (" << fhptr << ")" << dendl;
13155 return r;
13156 }
13157
13158 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13159 int flags, InodeRef *in, int caps, Fh **fhp,
13160 const UserPerm& perms)
13161 {
13162 *fhp = NULL;
13163
13164 vinodeno_t vparent = _get_vino(parent);
13165
13166 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13167 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13168 << ", gid " << perms.gid() << dendl;
13169 tout(cct) << "ll_create" << std::endl;
13170 tout(cct) << vparent.ino.val << std::endl;
13171 tout(cct) << name << std::endl;
13172 tout(cct) << mode << std::endl;
13173 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13174
13175 bool created = false;
13176 int r = _lookup(parent, name, caps, in, perms);
13177
13178 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13179 return -EEXIST;
13180
13181 if (r == -ENOENT && (flags & O_CREAT)) {
13182 if (!fuse_default_permissions) {
13183 r = may_create(parent, perms);
13184 if (r < 0)
13185 goto out;
13186 }
13187 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13188 perms);
13189 if (r < 0)
13190 goto out;
13191 }
13192
13193 if (r < 0)
13194 goto out;
13195
13196 ceph_assert(*in);
13197
13198 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13199 if (!created) {
13200 if (!fuse_default_permissions) {
13201 r = may_open(in->get(), flags, perms);
13202 if (r < 0) {
13203 if (*fhp) {
13204 int release_r = _release_fh(*fhp);
13205 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13206 }
13207 goto out;
13208 }
13209 }
13210 if (*fhp == NULL) {
13211 r = _open(in->get(), flags, mode, fhp, perms);
13212 if (r < 0)
13213 goto out;
13214 }
13215 }
13216
13217 out:
13218 if (*fhp) {
13219 ll_unclosed_fh_set.insert(*fhp);
13220 }
13221
13222 ino_t ino = 0;
13223 if (r >= 0) {
13224 Inode *inode = in->get();
13225 if (use_faked_inos())
13226 ino = inode->faked_ino;
13227 else
13228 ino = inode->ino;
13229 }
13230
13231 tout(cct) << (unsigned long)*fhp << std::endl;
13232 tout(cct) << ino << std::endl;
13233 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13234 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13235 *fhp << " " << hex << ino << dec << ")" << dendl;
13236
13237 return r;
13238 }
13239
13240 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13241 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13242 const UserPerm& perms)
13243 {
13244 std::lock_guard lock(client_lock);
13245 InodeRef in;
13246
13247 if (unmounting)
13248 return -ENOTCONN;
13249
13250 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13251 fhp, perms);
13252 if (r >= 0) {
13253 ceph_assert(in);
13254
13255 // passing an Inode in outp requires an additional ref
13256 if (outp) {
13257 _ll_get(in.get());
13258 *outp = in.get();
13259 }
13260 fill_stat(in, attr);
13261 } else {
13262 attr->st_ino = 0;
13263 }
13264
13265 return r;
13266 }
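/*
 * Usage sketch (illustrative only). Note that a non-NULL outp hands the
 * caller an extra inode reference, which must be given back via ll_forget():
 *
 *   struct stat attr;
 *   Inode *out = nullptr;
 *   Fh *fh = nullptr;
 *   int r = client->ll_create(parent, "newfile", 0644,
 *                             O_CREAT | O_EXCL | O_WRONLY,
 *                             &attr, &out, &fh, perms);
 *   if (r == 0) {
 *     client->ll_release(fh);
 *     client->ll_forget(out, 1);
 *   }
 */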
13267
13268 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13269 int oflags, Inode **outp, Fh **fhp,
13270 struct ceph_statx *stx, unsigned want, unsigned lflags,
13271 const UserPerm& perms)
13272 {
13273 unsigned caps = statx_to_mask(lflags, want);
13274 std::lock_guard lock(client_lock);
13275 InodeRef in;
13276
13277 if (unmounting)
13278 return -ENOTCONN;
13279
13280 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13281 if (r >= 0) {
13282 ceph_assert(in);
13283
13284 // passing an Inode in outp requires an additional ref
13285 if (outp) {
13286 _ll_get(in.get());
13287 *outp = in.get();
13288 }
13289 fill_statx(in, caps, stx);
13290 } else {
13291 stx->stx_ino = 0;
13292 stx->stx_mask = 0;
13293 }
13294
13295 return r;
13296 }
13297
13298 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13299 {
13300 std::lock_guard lock(client_lock);
13301 tout(cct) << "ll_lseek" << std::endl;
13302 tout(cct) << offset << std::endl;
13303 tout(cct) << whence << std::endl;
13304
13305 if (unmounting)
13306 return -ENOTCONN;
13307
13308 return _lseek(fh, offset, whence);
13309 }
13310
13311 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13312 {
13313 std::lock_guard lock(client_lock);
13314 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
13315 tout(cct) << "ll_read" << std::endl;
13316 tout(cct) << (unsigned long)fh << std::endl;
13317 tout(cct) << off << std::endl;
13318 tout(cct) << len << std::endl;
13319
13320 if (unmounting)
13321 return -ENOTCONN;
13322
13323 /* We can't return more than INT_MAX bytes read; clamp len to that */
13324 len = std::min(len, (loff_t)INT_MAX);
13325 return _read(fh, off, len, bl);
13326 }
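/*
 * Because ll_read() clamps each call to INT_MAX bytes, a caller that wants
 * more than that must loop (illustrative sketch; "want" is a hypothetical
 * total, and short reads near EOF are also possible):
 *
 *   loff_t pos = 0;
 *   while (pos < want) {
 *     bufferlist bl;
 *     int r = client->ll_read(fh, pos, want - pos, &bl);
 *     if (r <= 0)
 *       break;            // 0 == EOF, <0 == error
 *     pos += r;
 *     // ... consume bl ...
 *   }
 */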
13327
13328 int Client::ll_read_block(Inode *in, uint64_t blockid,
13329 char *buf,
13330 uint64_t offset,
13331 uint64_t length,
13332 file_layout_t* layout)
13333 {
13334 std::lock_guard lock(client_lock);
13335
13336 if (unmounting)
13337 return -ENOTCONN;
13338
13339 vinodeno_t vino = _get_vino(in);
13340 object_t oid = file_object_t(vino.ino, blockid);
13341 C_SaferCond onfinish;
13342 bufferlist bl;
13343
13344 objecter->read(oid,
13345 object_locator_t(layout->pool_id),
13346 offset,
13347 length,
13348 vino.snapid,
13349 &bl,
13350 CEPH_OSD_FLAG_READ,
13351 &onfinish);
13352
13353 client_lock.unlock();
13354 int r = onfinish.wait();
13355 client_lock.lock();
13356
13357 if (r >= 0) {
13358 bl.begin().copy(bl.length(), buf);
13359 r = bl.length();
13360 }
13361
13362 return r;
13363 }
13364
13365 /* It appears that the OSD doesn't return success unless the entire
13366 buffer was written; return the write length on success. */
13367
13368 int Client::ll_write_block(Inode *in, uint64_t blockid,
13369 char* buf, uint64_t offset,
13370 uint64_t length, file_layout_t* layout,
13371 uint64_t snapseq, uint32_t sync)
13372 {
13373 vinodeno_t vino = ll_get_vino(in);
13374 int r = 0;
13375 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13376
13377 if (length == 0) {
13378 return -EINVAL;
13379 }
13380 if (true || sync) {
13381 /* writes are currently always made stable here (the "true ||" forces
13382 * the synchronous path), so the epilogue below waits on this condition */
13383 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13384 }
13385 object_t oid = file_object_t(vino.ino, blockid);
13386 SnapContext fakesnap;
13387 ceph::bufferlist bl;
13388 if (length > 0) {
13389 bl.push_back(buffer::copy(buf, length));
13390 }
13391
13392 ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
13393 << dendl;
13394
13395 fakesnap.seq = snapseq;
13396
13397 /* lock just in time */
13398 client_lock.lock();
13399 if (unmounting) {
13400 client_lock.unlock();
13401 return -ENOTCONN;
13402 }
13403
13404 objecter->write(oid,
13405 object_locator_t(layout->pool_id),
13406 offset,
13407 length,
13408 fakesnap,
13409 bl,
13410 ceph::real_clock::now(),
13411 0,
13412 onsafe.get());
13413
13414 client_lock.unlock();
13415 if (nullptr != onsafe) {
13416 r = onsafe->wait();
13417 }
13418
13419 if (r < 0) {
13420 return r;
13421 } else {
13422 return length;
13423 }
13424 }
13425
13426 int Client::ll_commit_blocks(Inode *in,
13427 uint64_t offset,
13428 uint64_t length)
13429 {
13430 std::lock_guard lock(client_lock);
13431 /*
13432 BarrierContext *bctx;
13433 vinodeno_t vino = _get_vino(in);
13434 uint64_t ino = vino.ino;
13435
13436 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13437 << offset << " to " << length << dendl;
13438
13439 if (length == 0) {
13440 return -EINVAL;
13441 }
13442
13443 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13444 if (p != barriers.end()) {
13445 barrier_interval civ(offset, offset + length);
13446 p->second->commit_barrier(civ);
13447 }
13448 */
13449 return 0;
13450 }
13451
13452 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13453 {
13454 std::lock_guard lock(client_lock);
13455 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13456 "~" << len << dendl;
13457 tout(cct) << "ll_write" << std::endl;
13458 tout(cct) << (unsigned long)fh << std::endl;
13459 tout(cct) << off << std::endl;
13460 tout(cct) << len << std::endl;
13461
13462 if (unmounting)
13463 return -ENOTCONN;
13464
13465 /* We can't return more than INT_MAX bytes written; clamp len to that */
13466 len = std::min(len, (loff_t)INT_MAX);
13467 int r = _write(fh, off, len, data, NULL, 0);
13468 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13469 << dendl;
13470 return r;
13471 }
13472
13473 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13474 {
13475 std::lock_guard lock(client_lock);
13476 if (unmounting)
13477 return -ENOTCONN;
13478 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13479 }
13480
13481 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13482 {
13483 std::lock_guard lock(client_lock);
13484 if (unmounting)
13485 return -ENOTCONN;
13486 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13487 }
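/*
 * Scatter/gather sketch (illustrative only): ll_writev/ll_readv take a
 * plain POSIX iovec array and return bytes transferred or a negative error:
 *
 *   char hdr[16], body[4096];
 *   struct iovec iov[2] = {
 *     { hdr,  sizeof(hdr)  },
 *     { body, sizeof(body) },
 *   };
 *   int64_t n = client->ll_readv(fh, iov, 2, 0 /* offset */);
 */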
13488
13489 int Client::ll_flush(Fh *fh)
13490 {
13491 std::lock_guard lock(client_lock);
13492 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13493 tout(cct) << "ll_flush" << std::endl;
13494 tout(cct) << (unsigned long)fh << std::endl;
13495
13496 if (unmounting)
13497 return -ENOTCONN;
13498
13499 return _flush(fh);
13500 }
13501
13502 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13503 {
13504 std::lock_guard lock(client_lock);
13505 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13506 tout(cct) << "ll_fsync" << std::endl;
13507 tout(cct) << (unsigned long)fh << std::endl;
13508
13509 if (unmounting)
13510 return -ENOTCONN;
13511
13512 int r = _fsync(fh, syncdataonly);
13513 if (r) {
13514 // If we're returning an error, clear it from the FH
13515 fh->take_async_err();
13516 }
13517 return r;
13518 }
13519
13520 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13521 {
13522 std::lock_guard lock(client_lock);
13523 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13524 tout(cct) << "ll_sync_inode" << std::endl;
13525 tout(cct) << (unsigned long)in << std::endl;
13526
13527 if (unmounting)
13528 return -ENOTCONN;
13529
13530 return _fsync(in, syncdataonly);
13531 }
13532
13533 #ifdef FALLOC_FL_PUNCH_HOLE
13534
13535 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13536 {
13537 if (offset < 0 || length <= 0)
13538 return -EINVAL;
13539
13540 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
13541 return -EOPNOTSUPP;
13542
13543 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
13544 return -EOPNOTSUPP;
13545
13546 Inode *in = fh->inode.get();
13547
13548 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
13549 !(mode & FALLOC_FL_PUNCH_HOLE)) {
13550 return -ENOSPC;
13551 }
13552
13553 if (in->snapid != CEPH_NOSNAP)
13554 return -EROFS;
13555
13556 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
13557 return -EBADF;
13558
13559 uint64_t size = offset + length;
13560 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
13561 size > in->size &&
13562 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
13563 return -EDQUOT;
13564 }
13565
13566 int have;
13567 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
13568 if (r < 0)
13569 return r;
13570
13571 std::unique_ptr<C_SaferCond> onuninline = nullptr;
13572 if (mode & FALLOC_FL_PUNCH_HOLE) {
13573 if (in->inline_version < CEPH_INLINE_NONE &&
13574 (have & CEPH_CAP_FILE_BUFFER)) {
13575 bufferlist bl;
13576 auto inline_iter = in->inline_data.cbegin();
13577 int len = in->inline_data.length();
13578 if (offset < len) {
13579 if (offset > 0)
13580 inline_iter.copy(offset, bl);
13581 int size = length;
13582 if (offset + size > len)
13583 size = len - offset;
13584 if (size > 0)
13585 bl.append_zero(size);
13586 if (offset + size < len) {
13587 inline_iter += size;
13588 inline_iter.copy(len - offset - size, bl);
13589 }
13590 in->inline_data = bl;
13591 in->inline_version++;
13592 }
13593 in->mtime = in->ctime = ceph_clock_now();
13594 in->change_attr++;
13595 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13596 } else {
13597 if (in->inline_version < CEPH_INLINE_NONE) {
13598 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13599 uninline_data(in, onuninline.get());
13600 }
13601
13602 C_SaferCond onfinish("Client::_punch_hole flock");
13603
13604 unsafe_sync_write++;
13605 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
13606
13607 _invalidate_inode_cache(in, offset, length);
13608 filer->zero(in->ino, &in->layout,
13609 in->snaprealm->get_snap_context(),
13610 offset, length,
13611 ceph::real_clock::now(),
13612 0, true, &onfinish);
13613 in->mtime = in->ctime = ceph_clock_now();
13614 in->change_attr++;
13615 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13616
13617 client_lock.unlock();
13618 onfinish.wait();
13619 client_lock.lock();
13620 _sync_write_commit(in);
13621 }
13622 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
13623 uint64_t size = offset + length;
13624 if (size > in->size) {
13625 in->size = size;
13626 in->mtime = in->ctime = ceph_clock_now();
13627 in->change_attr++;
13628 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13629
13630 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
13631 check_caps(in, CHECK_CAPS_NODELAY);
13632 } else if (is_max_size_approaching(in)) {
13633 check_caps(in, 0);
13634 }
13635 }
13636 }
13637
13638 if (nullptr != onuninline) {
13639 client_lock.unlock();
13640 int ret = onuninline->wait();
13641 client_lock.lock();
13642
13643 if (ret >= 0 || ret == -ECANCELED) {
13644 in->inline_data.clear();
13645 in->inline_version = CEPH_INLINE_NONE;
13646 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
13647 check_caps(in, 0);
13648 } else
13649 r = ret;
13650 }
13651
13652 put_cap_ref(in, CEPH_CAP_FILE_WR);
13653 return r;
13654 }
13655 #else
13656
13657 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13658 {
13659 return -EOPNOTSUPP;
13660 }
13661
13662 #endif
13663
13664
13665 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13666 {
13667 std::lock_guard lock(client_lock);
13668 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13669 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13670 tout(cct) << (unsigned long)fh << std::endl;
13671
13672 if (unmounting)
13673 return -ENOTCONN;
13674
13675 return _fallocate(fh, mode, offset, length);
13676 }
13677
13678 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13679 {
13680 std::lock_guard lock(client_lock);
13681 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
13682
13683 if (unmounting)
13684 return -ENOTCONN;
13685
13686 Fh *fh = get_filehandle(fd);
13687 if (!fh)
13688 return -EBADF;
13689 #if defined(__linux__) && defined(O_PATH)
13690 if (fh->flags & O_PATH)
13691 return -EBADF;
13692 #endif
13693 return _fallocate(fh, mode, offset, length);
13694 }
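/*
 * Usage sketch (illustrative only). Per the checks in _fallocate(),
 * FALLOC_FL_PUNCH_HOLE is only accepted together with FALLOC_FL_KEEP_SIZE,
 * while mode == 0 extends the file size like posix_fallocate():
 *
 *   int r = client->fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                             off, len);
 */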
13695
13696 int Client::ll_release(Fh *fh)
13697 {
13698 std::lock_guard lock(client_lock);
13699
13700 if (unmounting)
13701 return -ENOTCONN;
13702
13703 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13704 dendl;
13705 tout(cct) << __func__ << " (fh)" << std::endl;
13706 tout(cct) << (unsigned long)fh << std::endl;
13707
13708 if (ll_unclosed_fh_set.count(fh))
13709 ll_unclosed_fh_set.erase(fh);
13710 return _release_fh(fh);
13711 }
13712
13713 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13714 {
13715 std::lock_guard lock(client_lock);
13716
13717 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13718 tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;
13719
13720 if (unmounting)
13721 return -ENOTCONN;
13722
13723 return _getlk(fh, fl, owner);
13724 }
13725
13726 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13727 {
13728 std::lock_guard lock(client_lock);
13729
13730 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13731 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13732
13733 if (unmounting)
13734 return -ENOTCONN;
13735
13736 return _setlk(fh, fl, owner, sleep);
13737 }
13738
13739 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13740 {
13741 std::lock_guard lock(client_lock);
13742
13743 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13744 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13745
13746 if (unmounting)
13747 return -ENOTCONN;
13748
13749 return _flock(fh, cmd, owner);
13750 }
13751
13752 int Client::set_deleg_timeout(uint32_t timeout)
13753 {
13754 std::lock_guard lock(client_lock);
13755
13756 /*
13757 * The whole point is to prevent blacklisting, so we must time out the
13758 * delegation before the session autoclose timeout kicks in.
13759 */
13760 if (timeout >= mdsmap->get_session_autoclose())
13761 return -EINVAL;
13762
13763 deleg_timeout = timeout;
13764 return 0;
13765 }
13766
13767 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13768 {
13769 int ret = -EINVAL;
13770
13771 std::lock_guard lock(client_lock);
13772
13773 if (!mounted)
13774 return -ENOTCONN;
13775
13776 Inode *inode = fh->inode.get();
13777
13778 switch(cmd) {
13779 case CEPH_DELEGATION_NONE:
13780 inode->unset_deleg(fh);
13781 ret = 0;
13782 break;
13783 default:
13784 try {
13785 ret = inode->set_deleg(fh, cmd, cb, priv);
13786 } catch (std::bad_alloc&) {
13787 ret = -ENOMEM;
13788 }
13789 break;
13790 }
13791 return ret;
13792 }
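/*
 * Delegation sketch (illustrative only; "my_recall_cb" and "priv" are
 * hypothetical, with my_recall_cb matching ceph_deleg_cb_t). The timeout
 * must be shorter than the MDS session autoclose interval, or
 * set_deleg_timeout() rejects it with -EINVAL:
 *
 *   client->set_deleg_timeout(30);
 *   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, my_recall_cb, priv);
 *   // ... later, before closing:
 *   client->ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr);
 */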
13793
13794 class C_Client_RequestInterrupt : public Context {
13795 private:
13796 Client *client;
13797 MetaRequest *req;
13798 public:
13799 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13800 req->get();
13801 }
13802 void finish(int r) override {
13803 std::lock_guard l(client->client_lock);
13804 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13805 client->_interrupt_filelock(req);
13806 client->put_request(req);
13807 }
13808 };
13809
13810 void Client::ll_interrupt(void *d)
13811 {
13812 MetaRequest *req = static_cast<MetaRequest*>(d);
13813 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13814 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
13815 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13816 }
13817
13818 // =========================================
13819 // layout
13820
13821 // expose file layouts
13822
13823 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13824 const UserPerm& perms)
13825 {
13826 std::lock_guard lock(client_lock);
13827
13828 if (unmounting)
13829 return -ENOTCONN;
13830
13831 filepath path(relpath);
13832 InodeRef in;
13833 int r = path_walk(path, &in, perms);
13834 if (r < 0)
13835 return r;
13836
13837 *lp = in->layout;
13838
13839 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13840 return 0;
13841 }
13842
13843 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13844 {
13845 std::lock_guard lock(client_lock);
13846
13847 if (unmounting)
13848 return -ENOTCONN;
13849
13850 Fh *f = get_filehandle(fd);
13851 if (!f)
13852 return -EBADF;
13853 Inode *in = f->inode.get();
13854
13855 *lp = in->layout;
13856
13857 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13858 return 0;
13859 }
13860
13861 int64_t Client::get_default_pool_id()
13862 {
13863 std::lock_guard lock(client_lock);
13864
13865 if (unmounting)
13866 return -ENOTCONN;
13867
13868 /* first data pool is the default */
13869 return mdsmap->get_first_data_pool();
13870 }
13871
13872 // expose osdmap
13873
13874 int64_t Client::get_pool_id(const char *pool_name)
13875 {
13876 std::lock_guard lock(client_lock);
13877
13878 if (unmounting)
13879 return -ENOTCONN;
13880
13881 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13882 pool_name);
13883 }
13884
13885 string Client::get_pool_name(int64_t pool)
13886 {
13887 std::lock_guard lock(client_lock);
13888
13889 if (unmounting)
13890 return string();
13891
13892 return objecter->with_osdmap([pool](const OSDMap& o) {
13893 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13894 });
13895 }
13896
13897 int Client::get_pool_replication(int64_t pool)
13898 {
13899 std::lock_guard lock(client_lock);
13900
13901 if (unmounting)
13902 return -ENOTCONN;
13903
13904 return objecter->with_osdmap([pool](const OSDMap& o) {
13905 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13906 });
13907 }
13908
13909 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13910 {
13911 std::lock_guard lock(client_lock);
13912
13913 if (unmounting)
13914 return -ENOTCONN;
13915
13916 Fh *f = get_filehandle(fd);
13917 if (!f)
13918 return -EBADF;
13919 Inode *in = f->inode.get();
13920
13921 vector<ObjectExtent> extents;
13922 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13923 ceph_assert(extents.size() == 1);
13924
13925 objecter->with_osdmap([&](const OSDMap& o) {
13926 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13927 o.pg_to_acting_osds(pg, osds);
13928 });
13929
13930 if (osds.empty())
13931 return -EINVAL;
13932
13933 /*
13934 * Return the remainder of the extent (stripe unit)
13935 *
13936 * If length = 1 is passed to Striper::file_to_extents we get a single
13937 * extent back, but its length is one so we still need to compute the length
13938 * to the end of the stripe unit.
13939 *
13940 * If length = su then we may get 1 or 2 objects back in the extents vector
13941 * which would have to be examined. Even then, the offsets are local to the
13942 * object, so matching up to the file offset is extra work.
13943 *
13944 * It seems simpler to stick with length = 1 and manually compute the
13945 * remainder.
13946 */
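/*
 * e.g. (illustrative): with stripe_unit = 4 MiB and off = 5 MiB, the
 * extent containing off ends at 8 MiB, so *len = 4M - (5M % 4M) = 3 MiB.
 */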
13947 if (len) {
13948 uint64_t su = in->layout.stripe_unit;
13949 *len = su - (off % su);
13950 }
13951
13952 return 0;
13953 }
13954
13955 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13956 {
13957 std::lock_guard lock(client_lock);
13958
13959 if (unmounting)
13960 return -ENOTCONN;
13961
13962 if (id < 0)
13963 return -EINVAL;
13964 return objecter->with_osdmap([&](const OSDMap& o) {
13965 return o.crush->get_full_location_ordered(id, path);
13966 });
13967 }
13968
13969 int Client::get_file_stripe_address(int fd, loff_t offset,
13970 vector<entity_addr_t>& address)
13971 {
13972 std::lock_guard lock(client_lock);
13973
13974 if (unmounting)
13975 return -ENOTCONN;
13976
13977 Fh *f = get_filehandle(fd);
13978 if (!f)
13979 return -EBADF;
13980 Inode *in = f->inode.get();
13981
13982 // which object?
13983 vector<ObjectExtent> extents;
13984 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13985 in->truncate_size, extents);
13986 ceph_assert(extents.size() == 1);
13987
13988 // now we have the object and its 'layout'
13989 return objecter->with_osdmap([&](const OSDMap& o) {
13990 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13991 vector<int> osds;
13992 o.pg_to_acting_osds(pg, osds);
13993 if (osds.empty())
13994 return -EINVAL;
13995 for (unsigned i = 0; i < osds.size(); i++) {
13996 entity_addr_t addr = o.get_addrs(osds[i]).front();
13997 address.push_back(addr);
13998 }
13999 return 0;
14000 });
14001 }
14002
14003 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14004 {
14005 std::lock_guard lock(client_lock);
14006
14007 if (unmounting)
14008 return -ENOTCONN;
14009
14010 return objecter->with_osdmap([&](const OSDMap& o) {
14011 if (!o.exists(osd))
14012 return -ENOENT;
14013
14014 addr = o.get_addrs(osd).front();
14015 return 0;
14016 });
14017 }
14018
14019 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14020 loff_t length, loff_t offset)
14021 {
14022 std::lock_guard lock(client_lock);
14023
14024 if (unmounting)
14025 return -ENOTCONN;
14026
14027 Fh *f = get_filehandle(fd);
14028 if (!f)
14029 return -EBADF;
14030 Inode *in = f->inode.get();
14031
14032 // map to a list of extents
14033 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14034
14035 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14036 return 0;
14037 }
14038
14039
14040 /* find an osd with the same ip. -ENXIO if none. */
14041 int Client::get_local_osd()
14042 {
14043 std::lock_guard lock(client_lock);
14044
14045 if (unmounting)
14046 return -ENOTCONN;
14047
14048 objecter->with_osdmap([this](const OSDMap& o) {
14049 if (o.get_epoch() != local_osd_epoch) {
14050 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
14051 local_osd_epoch = o.get_epoch();
14052 }
14053 });
14054 return local_osd;
14055 }
14056
14057
14058
14059
14060
14061
14062 // ===============================
14063
14064 void Client::ms_handle_connect(Connection *con)
14065 {
14066 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14067 }
14068
14069 bool Client::ms_handle_reset(Connection *con)
14070 {
14071 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14072 return false;
14073 }
14074
14075 void Client::ms_handle_remote_reset(Connection *con)
14076 {
14077 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14078 std::lock_guard l(client_lock);
14079 switch (con->get_peer_type()) {
14080 case CEPH_ENTITY_TYPE_MDS:
14081 {
14082 // kludge to figure out which mds this is; fixme with a Connection* state
14083 mds_rank_t mds = MDS_RANK_NONE;
14084 MetaSession *s = NULL;
14085 for (auto &p : mds_sessions) {
14086 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14087 mds = p.first;
14088 s = &p.second;
14089 }
14090 }
14091 if (mds >= 0) {
14092 ceph_assert(s != NULL);
14093 switch (s->state) {
14094 case MetaSession::STATE_CLOSING:
14095 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14096 _closed_mds_session(s);
14097 break;
14098
14099 case MetaSession::STATE_OPENING:
14100 {
14101 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14102 list<Context*> waiters;
14103 waiters.swap(s->waiting_for_open);
14104 _closed_mds_session(s);
14105 MetaSession *news = _get_or_open_mds_session(mds);
14106 news->waiting_for_open.swap(waiters);
14107 }
14108 break;
14109
14110 case MetaSession::STATE_OPEN:
14111 {
14112 objecter->maybe_request_map(); /* to check if we are blacklisted */
14113 const auto& conf = cct->_conf;
14114 if (conf->client_reconnect_stale) {
14115 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14116 _closed_mds_session(s);
14117 } else {
14118 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14119 s->state = MetaSession::STATE_STALE;
14120 }
14121 }
14122 break;
14123
14124 case MetaSession::STATE_NEW:
14125 case MetaSession::STATE_CLOSED:
14126 default:
14127 break;
14128 }
14129 }
14130 }
14131 break;
14132 }
14133 }
14134
14135 bool Client::ms_handle_refused(Connection *con)
14136 {
14137 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14138 return false;
14139 }
14140
14141 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14142 {
14143 Inode *quota_in = root_ancestor;
14144 SnapRealm *realm = in->snaprealm;
14145 while (realm) {
14146 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14147 if (realm->ino != in->ino) {
14148 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14149 if (p == inode_map.end())
14150 break;
14151
14152 if (p->second->quota.is_enable()) {
14153 quota_in = p->second;
14154 break;
14155 }
14156 }
14157 realm = realm->pparent;
14158 }
14159 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14160 return quota_in;
14161 }
14162
14163 /**
14164 * Traverse the quota ancestors of the Inode; return true
14165 * if any of them satisfies the given predicate
14166 */
14167 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14168 std::function<bool (const Inode &in)> test)
14169 {
14170 while (true) {
14171 ceph_assert(in != NULL);
14172 if (test(*in)) {
14173 return true;
14174 }
14175
14176 if (in == root_ancestor) {
14177 // We're done traversing, drop out
14178 return false;
14179 } else {
14180 // Continue up the tree
14181 in = get_quota_root(in, perms);
14182 }
14183 }
14184
14185 return false;
14186 }
14187
14188 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14189 {
14190 return check_quota_condition(in, perms,
14191 [](const Inode &in) {
14192 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14193 });
14194 }
14195
14196 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14197 const UserPerm& perms)
14198 {
14199 return check_quota_condition(in, perms,
14200 [&new_bytes](const Inode &in) {
14201 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14202 > in.quota.max_bytes;
14203 });
14204 }
14205
14206 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14207 {
14208 ceph_assert(in->size >= in->reported_size);
14209 const uint64_t size = in->size - in->reported_size;
14210 return check_quota_condition(in, perms,
14211 [&size](const Inode &in) {
14212 if (in.quota.max_bytes) {
14213 if (in.rstat.rbytes >= in.quota.max_bytes) {
14214 return true;
14215 }
14216
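// Heuristic (e.g.): report "approaching" once the unreported growth
// exceeds 1/16 of the remaining headroom; with 16 MiB left under
// quota, that means anything over 1 MiB of new data.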
14217 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14218 return (space >> 4) < size;
14219 } else {
14220 return false;
14221 }
14222 });
14223 }
14224
14225 enum {
14226 POOL_CHECKED = 1,
14227 POOL_CHECKING = 2,
14228 POOL_READ = 4,
14229 POOL_WRITE = 8,
14230 };
14231
14232 int Client::check_pool_perm(Inode *in, int need)
14233 {
14234 if (!cct->_conf->client_check_pool_perm)
14235 return 0;
14236
14237 int64_t pool_id = in->layout.pool_id;
14238 std::string pool_ns = in->layout.pool_ns;
14239 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14240 int have = 0;
14241 while (true) {
14242 auto it = pool_perms.find(perm_key);
14243 if (it == pool_perms.end())
14244 break;
14245 if (it->second == POOL_CHECKING) {
14246 // avoid concurrent checks
14247 wait_on_list(waiting_for_pool_perm);
14248 } else {
14249 have = it->second;
14250 ceph_assert(have & POOL_CHECKED);
14251 break;
14252 }
14253 }
14254
14255 if (!have) {
14256 if (in->snapid != CEPH_NOSNAP) {
14257 // The pool permission check needs to write to the first object. But for a
14258 // snapshot, the head of the first object may have already been deleted. To
14259 // avoid creating an orphan object, skip the check for now.
14260 return 0;
14261 }
14262
14263 pool_perms[perm_key] = POOL_CHECKING;
14264
14265 char oid_buf[32];
14266 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14267 object_t oid = oid_buf;
14268
14269 SnapContext nullsnapc;
14270
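// Probe read permission with a stat and write permission with an
// exclusive create; -ENOENT / -EEXIST replies still prove the op
// was permitted by the OSD.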
14271 C_SaferCond rd_cond;
14272 ObjectOperation rd_op;
14273 rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
14274
14275 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14276 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14277
14278 C_SaferCond wr_cond;
14279 ObjectOperation wr_op;
14280 wr_op.create(true);
14281
14282 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14283 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14284
14285 client_lock.unlock();
14286 int rd_ret = rd_cond.wait();
14287 int wr_ret = wr_cond.wait();
14288 client_lock.lock();
14289
14290 bool errored = false;
14291
14292 if (rd_ret == 0 || rd_ret == -ENOENT)
14293 have |= POOL_READ;
14294 else if (rd_ret != -EPERM) {
14295 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14296 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14297 errored = true;
14298 }
14299
14300 if (wr_ret == 0 || wr_ret == -EEXIST)
14301 have |= POOL_WRITE;
14302 else if (wr_ret != -EPERM) {
14303 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14304 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14305 errored = true;
14306 }
14307
14308 if (errored) {
14309 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14310 // Raise EIO because actual error code might be misleading for
14311 // userspace filesystem user.
14312 pool_perms.erase(perm_key);
14313 signal_cond_list(waiting_for_pool_perm);
14314 return -EIO;
14315 }
14316
14317 pool_perms[perm_key] = have | POOL_CHECKED;
14318 signal_cond_list(waiting_for_pool_perm);
14319 }
14320
14321 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
14322 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14323 << " need " << ccap_string(need) << ", but no read perm" << dendl;
14324 return -EPERM;
14325 }
14326 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
14327 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14328 << " need " << ccap_string(need) << ", but no write perm" << dendl;
14329 return -EPERM;
14330 }
14331
14332 return 0;
14333 }
14334
14335 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14336 {
14337 if (acl_type == POSIX_ACL) {
14338 if (in->xattrs.count(ACL_EA_ACCESS)) {
14339 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14340
14341 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14342 }
14343 }
14344 return -EAGAIN;
14345 }
14346
14347 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14348 {
14349 if (acl_type == NO_ACL)
14350 return 0;
14351
14352 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14353 if (r < 0)
14354 goto out;
14355
14356 if (acl_type == POSIX_ACL) {
14357 if (in->xattrs.count(ACL_EA_ACCESS)) {
14358 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14359 bufferptr acl(access_acl.c_str(), access_acl.length());
14360 r = posix_acl_access_chmod(acl, mode);
14361 if (r < 0)
14362 goto out;
14363 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14364 } else {
14365 r = 0;
14366 }
14367 }
14368 out:
14369 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14370 return r;
14371 }
14372
14373 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
14374 const UserPerm& perms)
14375 {
14376 if (acl_type == NO_ACL)
14377 return 0;
14378
14379 if (S_ISLNK(*mode))
14380 return 0;
14381
14382 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
14383 if (r < 0)
14384 goto out;
14385
14386 if (acl_type == POSIX_ACL) {
14387 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
14388 map<string, bufferptr> xattrs;
14389
14390 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
14391 bufferptr acl(default_acl.c_str(), default_acl.length());
14392 r = posix_acl_inherit_mode(acl, mode);
14393 if (r < 0)
14394 goto out;
14395
14396 if (r > 0) {
14397 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
14398 if (r < 0)
14399 goto out;
14400 if (r > 0)
14401 xattrs[ACL_EA_ACCESS] = acl;
14402 }
14403
14404 if (S_ISDIR(*mode))
14405 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
14406
14407 r = xattrs.size();
14408 if (r > 0)
14409 encode(xattrs, xattrs_bl);
14410 } else {
14411 if (umask_cb)
14412 *mode &= ~umask_cb(callback_handle);
14413 r = 0;
14414 }
14415 }
14416 out:
14417 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
14418 return r;
14419 }
14420
14421 void Client::set_filer_flags(int flags)
14422 {
14423 std::lock_guard l(client_lock);
14424 ceph_assert(flags == 0 ||
14425 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14426 objecter->add_global_op_flags(flags);
14427 }
14428
14429 void Client::clear_filer_flags(int flags)
14430 {
14431 std::lock_guard l(client_lock);
14432 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14433 objecter->clear_global_op_flag(flags);
14434 }
14435
14436 // called before mount
14437 void Client::set_uuid(const std::string& uuid)
14438 {
14439 std::lock_guard l(client_lock);
14440 assert(initialized);
14441 assert(!uuid.empty());
14442
14443 metadata["uuid"] = uuid;
14444 _close_sessions();
14445 }
14446
14447 // called before mount. 0 means infinite
14448 void Client::set_session_timeout(unsigned timeout)
14449 {
14450 std::lock_guard l(client_lock);
14451 assert(initialized);
14452
14453 metadata["timeout"] = stringify(timeout);
14454 }
14455
14456 // called before mount
14457 int Client::start_reclaim(const std::string& uuid, unsigned flags,
14458 const std::string& fs_name)
14459 {
14460 std::lock_guard l(client_lock);
14461 if (!initialized)
14462 return -ENOTCONN;
14463
14464 if (uuid.empty())
14465 return -EINVAL;
14466
14467 {
14468 auto it = metadata.find("uuid");
14469 if (it != metadata.end() && it->second == uuid)
14470 return -EINVAL;
14471 }
14472
14473 int r = subscribe_mdsmap(fs_name);
14474 if (r < 0) {
14475 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
14476 return r;
14477 }
14478
14479 if (metadata.empty())
14480 populate_metadata("");
14481
14482 while (mdsmap->get_epoch() == 0)
14483 wait_on_list(waiting_for_mdsmap);
14484
14485 reclaim_errno = 0;
14486 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
14487 if (!mdsmap->is_up(mds)) {
14488 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
14489 wait_on_list(waiting_for_mdsmap);
14490 continue;
14491 }
14492
14493 MetaSession *session;
14494 if (!have_open_session(mds)) {
14495 session = _get_or_open_mds_session(mds);
14496 if (session->state != MetaSession::STATE_OPENING) {
14497 // umounting?
14498 return -EINVAL;
14499 }
14500 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
14501 wait_on_context_list(session->waiting_for_open);
14502 if (rejected_by_mds.count(mds))
14503 return -EPERM;
14504 continue;
14505 }
14506
14507 session = &mds_sessions.at(mds);
14508 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
14509 return -EOPNOTSUPP;
14510
14511 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
14512 session->reclaim_state == MetaSession::RECLAIMING) {
14513 session->reclaim_state = MetaSession::RECLAIMING;
14514 auto m = make_message<MClientReclaim>(uuid, flags);
14515 session->con->send_message2(std::move(m));
14516 wait_on_list(waiting_for_reclaim);
14517 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
14518 return reclaim_errno ? : -ENOTRECOVERABLE;
14519 } else {
14520 mds++;
14521 }
14522 }
14523
14524 // didn't find target session in any mds
14525 if (reclaim_target_addrs.empty()) {
14526 if (flags & CEPH_RECLAIM_RESET)
14527 return -ENOENT;
14528 return -ENOTRECOVERABLE;
14529 }
14530
14531 if (flags & CEPH_RECLAIM_RESET)
14532 return 0;
14533
14534 // use blacklist to check if target session was killed
14535 // (config option mds_session_blacklist_on_evict needs to be true)
14536 C_SaferCond cond;
14537 if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
14538 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
14539 client_lock.unlock();
14540 cond.wait();
14541 client_lock.lock();
14542 }
14543
14544 bool blacklisted = objecter->with_osdmap(
14545 [this](const OSDMap &osd_map) -> bool {
14546 return osd_map.is_blacklisted(reclaim_target_addrs);
14547 });
14548 if (blacklisted)
14549 return -ENOTRECOVERABLE;
14550
14551 metadata["reclaiming_uuid"] = uuid;
14552 return 0;
14553 }
14554
14555 void Client::finish_reclaim()
14556 {
14557 auto it = metadata.find("reclaiming_uuid");
14558 if (it == metadata.end()) {
14559 for (auto &p : mds_sessions)
14560 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14561 return;
14562 }
14563
14564 for (auto &p : mds_sessions) {
14565 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14566 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
14567 p.second.con->send_message2(std::move(m));
14568 }
14569
14570 metadata["uuid"] = it->second;
14571 metadata.erase(it);
14572 }
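/*
 * Takeover sketch (illustrative only; the ordering follows the "called
 * before mount" comments above, and error handling is elided). A
 * replacement instance reclaims the MDS session state of a dead client
 * that had registered "old-uuid":
 *
 *   client->set_uuid("new-uuid");               // identity of this instance
 *   int r = client->start_reclaim("old-uuid", CEPH_RECLAIM_RESET, fs_name);
 *   if (r == 0)
 *     client->finish_reclaim();
 *   client->mount("/", perms, true);
 */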
14573
14574 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14575 {
14576 mds_rank_t from = mds_rank_t(reply->get_source().num());
14577 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14578
14579 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14580 if (!session) {
14581 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14582 return;
14583 }
14584
14585 if (reply->get_result() >= 0) {
14586 session->reclaim_state = MetaSession::RECLAIM_OK;
14587 if (reply->get_epoch() > reclaim_osd_epoch)
14588 reclaim_osd_epoch = reply->get_epoch();
14589 if (!reply->get_addrs().empty())
14590 reclaim_target_addrs = reply->get_addrs();
14591 } else {
14592 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14593 reclaim_errno = reply->get_result();
14594 }
14595
14596 signal_cond_list(waiting_for_reclaim);
14597 }
14598
14599 /**
14600 * This is included in cap release messages, to cause
14601 * the MDS to wait until this OSD map epoch. It is necessary
14602 * in corner cases where we cancel RADOS ops, so that
14603 * nobody else tries to do IO to the same objects in
14604 * the same epoch as the cancelled ops.
14605 */
14606 void Client::set_cap_epoch_barrier(epoch_t e)
14607 {
14608 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14609 cap_epoch_barrier = e;
14610 }
14611
14612 const char** Client::get_tracked_conf_keys() const
14613 {
14614 static const char* keys[] = {
14615 "client_cache_size",
14616 "client_cache_mid",
14617 "client_acl_type",
14618 "client_deleg_timeout",
14619 "client_deleg_break_on_open",
14620 NULL
14621 };
14622 return keys;
14623 }
14624
14625 void Client::handle_conf_change(const ConfigProxy& conf,
14626 const std::set <std::string> &changed)
14627 {
14628 std::lock_guard lock(client_lock);
14629
14630 if (changed.count("client_cache_mid")) {
14631 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14632 }
14633 if (changed.count("client_acl_type")) {
14634 acl_type = NO_ACL;
14635 if (cct->_conf->client_acl_type == "posix_acl")
14636 acl_type = POSIX_ACL;
14637 }
14638 }
14639
14640 void intrusive_ptr_add_ref(Inode *in)
14641 {
14642 in->get();
14643 }
14644
14645 void intrusive_ptr_release(Inode *in)
14646 {
14647 in->client->put_inode(in);
14648 }
14649
14650 mds_rank_t Client::_get_random_up_mds() const
14651 {
14652 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14653
14654 std::set<mds_rank_t> up;
14655 mdsmap->get_up_mds_set(up);
14656
14657 if (up.empty())
14658 return MDS_RANK_NONE;
14659 std::set<mds_rank_t>::const_iterator p = up.begin();
14660 for (int n = rand() % up.size(); n; n--)
14661 ++p;
14662 return *p;
14663 }
14664
14665
14666 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
14667 : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
14668 {
14669 monclient->set_messenger(m);
14670 objecter->set_client_incarnation(0);
14671 }
14672
14673 StandaloneClient::~StandaloneClient()
14674 {
14675 delete objecter;
14676 objecter = nullptr;
14677 }
14678
14679 int StandaloneClient::init()
14680 {
14681 _pre_init();
14682 objecter->init();
14683
14684 client_lock.lock();
14685 ceph_assert(!is_initialized());
14686
14687 messenger->add_dispatcher_tail(objecter);
14688 messenger->add_dispatcher_tail(this);
14689
14690 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
14691 int r = monclient->init();
14692 if (r < 0) {
14693 // need to do cleanup because we're in an intermediate init state
14694 timer.shutdown();
14695 client_lock.unlock();
14696 objecter->shutdown();
14697 objectcacher->stop();
14698 monclient->shutdown();
14699 return r;
14700 }
14701 objecter->start();
14702
14703 client_lock.unlock();
14704 _finish_init();
14705
14706 return 0;
14707 }
14708
14709 void StandaloneClient::shutdown()
14710 {
14711 Client::shutdown();
14712 objecter->shutdown();
14713 monclient->shutdown();
14714 }