]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import 15.2.5
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
77
78 #define dout_subsys ceph_subsys_client
79
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
83
84 #include "Client.h"
85 #include "Inode.h"
86 #include "Dentry.h"
87 #include "Delegation.h"
88 #include "Dir.h"
89 #include "ClientSnapRealm.h"
90 #include "Fh.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
95
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
98
99 #include "include/cephfs/ceph_ll_client.h"
100
101 #if HAVE_GETGROUPLIST
102 #include <grp.h>
103 #include <pwd.h>
104 #include <unistd.h>
105 #endif
106
107 #undef dout_prefix
108 #define dout_prefix *_dout << "client." << whoami << " "
109
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112 // FreeBSD fails to define this
113 #ifndef O_DSYNC
114 #define O_DSYNC 0x0
115 #endif
116 // Darwin fails to define this
117 #ifndef O_RSYNC
118 #define O_RSYNC 0x0
119 #endif
120
121 #ifndef O_DIRECT
122 #define O_DIRECT 0x0
123 #endif
124
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
128 {
129 Client *client = static_cast<Client*>(p);
130 client->flush_set_callback(oset);
131 }
132
133
134 // -------------
135
136 Client::CommandHook::CommandHook(Client *client) :
137 m_client(client)
138 {
139 }
140
141 int Client::CommandHook::call(
142 std::string_view command,
143 const cmdmap_t& cmdmap,
144 Formatter *f,
145 std::ostream& errss,
146 bufferlist& out)
147 {
148 f->open_object_section("result");
149 {
150 std::lock_guard l{m_client->client_lock};
151 if (command == "mds_requests")
152 m_client->dump_mds_requests(f);
153 else if (command == "mds_sessions")
154 m_client->dump_mds_sessions(f);
155 else if (command == "dump_cache")
156 m_client->dump_cache(f);
157 else if (command == "kick_stale_sessions")
158 m_client->_kick_stale_sessions();
159 else if (command == "status")
160 m_client->dump_status(f);
161 else
162 ceph_abort_msg("bad command registered");
163 }
164 f->close_section();
165 return 0;
166 }
167
168
169 // -------------
170
// Fresh readdir cursor on inode 'in' with the caller's credentials.
// next_offset starts at 2: offsets below 2 appear reserved — readdir
// restarts at offset 2 (see insert_readdir_results) — TODO confirm.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
176
177 void Client::_reset_faked_inos()
178 {
179 ino_t start = 1024;
180 free_faked_inos.clear();
181 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
182 last_used_faked_ino = 0;
183 last_used_faked_root = 0;
184 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
185 }
186
// Allocate the next free faked inode number for 'in', record the
// reverse mapping, and remove the id from the free pool.  The cursor
// (last_used_faked_ino) scans upward through free_faked_inos and wraps
// around when it runs off the top.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran off the top of the free set: wrap back to just above the
    // reserved root range and search again
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // cursor fell into a gap; jump to the start of the next free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside a free interval; hand out the next id
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
208
209 /*
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10bits(0x3ff) of the "root inodes".
215 */
216 void Client::_assign_faked_root(Inode *in)
217 {
218 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
219 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
220 last_used_faked_root = 0;
221 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
222 }
223 assert(it != free_faked_inos.end());
224 vinodeno_t inode_info = in->vino();
225 uint64_t inode_num = (uint64_t)inode_info.ino;
226 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
227 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it.get_start() + it.get_len() > last_used_faked_root);
229
230 in->faked_ino = last_used_faked_root;
231 free_faked_inos.erase(in->faked_ino);
232 faked_ino_map[in->faked_ino] = in->vino();
233 }
234
235 void Client::_release_faked_ino(Inode *in)
236 {
237 free_faked_inos.insert(in->faked_ino);
238 faked_ino_map.erase(in->faked_ino);
239 }
240
241 vinodeno_t Client::_map_faked_ino(ino_t ino)
242 {
243 vinodeno_t vino;
244 if (ino == 1)
245 vino = root->vino();
246 else if (faked_ino_map.count(ino))
247 vino = faked_ino_map[ino];
248 else
249 vino = vinodeno_t(0, CEPH_NOSNAP);
250 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
251 return vino;
252 }
253
254 vinodeno_t Client::map_faked_ino(ino_t ino)
255 {
256 std::lock_guard lock(client_lock);
257 return _map_faked_ino(ino);
258 }
259
260 // cons/des
261
// Construct an unmounted client: wire up the messenger/monitor/objecter
// plumbing, finishers, and the writeback object cache.  No MDS contact
// happens here.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // identity / permission-model knobs from configuration
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: hand out fd numbers starting at 10
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: writeback handler feeds the object cache, which
  // shares client_lock and reports commits via client_flush_set_callback
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
308
309
310 Client::~Client()
311 {
312 ceph_assert(ceph_mutex_is_not_locked(client_lock));
313
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 std::lock_guard l{client_lock};
318 tear_down_cache();
319 }
320
321 void Client::tear_down_cache()
322 {
323 // fd's
324 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
325 it != fd_map.end();
326 ++it) {
327 Fh *fh = it->second;
328 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
329 _release_fh(fh);
330 }
331 fd_map.clear();
332
333 while (!opened_dirs.empty()) {
334 dir_result_t *dirp = *opened_dirs.begin();
335 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
336 _closedir(dirp);
337 }
338
339 // caps!
340 // *** FIXME ***
341
342 // empty lru
343 trim_cache();
344 ceph_assert(lru.lru_get_size() == 0);
345
346 // close root ino
347 ceph_assert(inode_map.size() <= 1 + root_parents.size());
348 if (root && inode_map.size() == 1 + root_parents.size()) {
349 delete root;
350 root = 0;
351 root_ancestor = 0;
352 while (!root_parents.empty())
353 root_parents.erase(root_parents.begin());
354 inode_map.clear();
355 _reset_faked_inos();
356 }
357
358 ceph_assert(inode_map.empty());
359 }
360
361 inodeno_t Client::get_root_ino()
362 {
363 std::lock_guard l(client_lock);
364 if (use_faked_inos())
365 return root->faked_ino;
366 else
367 return root->ino;
368 }
369
370 Inode *Client::get_root()
371 {
372 std::lock_guard l(client_lock);
373 root->ll_get();
374 return root;
375 }
376
377
378 // debug crapola
379
380 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
381 {
382 filepath path;
383 in->make_long_path(path);
384 ldout(cct, 1) << "dump_inode: "
385 << (disconnected ? "DISCONNECTED ":"")
386 << "inode " << in->ino
387 << " " << path
388 << " ref " << in->get_num_ref()
389 << *in << dendl;
390
391 if (f) {
392 f->open_object_section("inode");
393 f->dump_stream("path") << path;
394 if (disconnected)
395 f->dump_int("disconnected", 1);
396 in->dump(f);
397 f->close_section();
398 }
399
400 did.insert(in);
401 if (in->dir) {
402 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
403 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
404 it != in->dir->dentries.end();
405 ++it) {
406 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
407 if (f) {
408 f->open_object_section("dentry");
409 it->second->dump(f);
410 f->close_section();
411 }
412 if (it->second->inode)
413 dump_inode(f, it->second->inode.get(), did, false);
414 }
415 }
416 }
417
418 void Client::dump_cache(Formatter *f)
419 {
420 set<Inode*> did;
421
422 ldout(cct, 1) << __func__ << dendl;
423
424 if (f)
425 f->open_array_section("cache");
426
427 if (root)
428 dump_inode(f, root, did, true);
429
430 // make a second pass to catch anything disconnected
431 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
432 it != inode_map.end();
433 ++it) {
434 if (did.count(it->second))
435 continue;
436 dump_inode(f, it->second, did, true);
437 }
438
439 if (f)
440 f->close_section();
441 }
442
443 void Client::dump_status(Formatter *f)
444 {
445 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
446
447 ldout(cct, 1) << __func__ << dendl;
448
449 const epoch_t osd_epoch
450 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
451
452 if (f) {
453 f->open_object_section("metadata");
454 for (const auto& kv : metadata)
455 f->dump_string(kv.first.c_str(), kv.second);
456 f->close_section();
457
458 f->dump_int("dentry_count", lru.lru_get_size());
459 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
460 f->dump_int("id", get_nodeid().v);
461 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
462 f->dump_object("inst", inst);
463 f->dump_object("addr", inst.addr);
464 f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
465 f->dump_string("addr_str", inst.addr.get_legacy_str());
466 f->dump_int("inode_count", inode_map.size());
467 f->dump_int("mds_epoch", mdsmap->get_epoch());
468 f->dump_int("osd_epoch", osd_epoch);
469 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
470 f->dump_bool("blacklisted", blacklisted);
471 }
472 }
473
// Start the infrastructure init() depends on: the safe timer, the
// objecter finisher thread (which the Filer also uses), blacklist
// event delivery, and the object cache's threads.
void Client::_pre_init()
{
  timer.init();

  // finisher must be running before the Filer is built on top of it
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}
484
// Bring the client up: start timers/finishers (_pre_init), register as
// a message dispatcher, then register perf counters and admin socket
// commands (_finish_init).  Returns 0.
int Client::init()
{
  _pre_init();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}
496
497 void Client::_finish_init()
498 {
499 {
500 std::lock_guard l{client_lock};
501 // logger
502 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
503 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
504 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
505 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
506 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
507 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
508 logger.reset(plb.create_perf_counters());
509 cct->get_perfcounters_collection()->add(logger.get());
510 }
511
512 cct->_conf.add_observer(this);
513
514 AdminSocket* admin_socket = cct->get_admin_socket();
515 int ret = admin_socket->register_command("mds_requests",
516 &m_command_hook,
517 "show in-progress mds requests");
518 if (ret < 0) {
519 lderr(cct) << "error registering admin socket command: "
520 << cpp_strerror(-ret) << dendl;
521 }
522 ret = admin_socket->register_command("mds_sessions",
523 &m_command_hook,
524 "show mds session state");
525 if (ret < 0) {
526 lderr(cct) << "error registering admin socket command: "
527 << cpp_strerror(-ret) << dendl;
528 }
529 ret = admin_socket->register_command("dump_cache",
530 &m_command_hook,
531 "show in-memory metadata cache contents");
532 if (ret < 0) {
533 lderr(cct) << "error registering admin socket command: "
534 << cpp_strerror(-ret) << dendl;
535 }
536 ret = admin_socket->register_command("kick_stale_sessions",
537 &m_command_hook,
538 "kick sessions that were remote reset");
539 if (ret < 0) {
540 lderr(cct) << "error registering admin socket command: "
541 << cpp_strerror(-ret) << dendl;
542 }
543 ret = admin_socket->register_command("status",
544 &m_command_hook,
545 "show overall client status");
546 if (ret < 0) {
547 lderr(cct) << "error registering admin socket command: "
548 << cpp_strerror(-ret) << dendl;
549 }
550
551 std::lock_guard l{client_lock};
552 initialized = true;
553 }
554
// Tear the client down in the reverse of startup order: close MDS
// sessions, drop config/asok registrations, drain and stop each
// callback finisher that was started, stop the object cache, the
// timer, the objecter finisher, and finally remove the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // each finisher was only started if its callback was registered, so
  // only stop the ones whose callback is set; wait_for_empty() drains
  // queued work before stop()
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
614
615
616 // ===================
617 // metadata cache stuff
618
// Shrink the dentry LRU toward client_cache_size entries (all the way
// to zero while unmounting).  Optionally asks the kernel to drop its
// dcache too if we could not get under the limit, and frees the root
// inode once the cache is completely empty and unreferenced.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // iterate until a pass makes no progress (trim_dentry may change the
  // lru size by more than one entry)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  // still over the limit: our dentries are pinned by the kernel's
  // dcache, so ask it to let go
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
652
653 void Client::trim_cache_for_reconnect(MetaSession *s)
654 {
655 mds_rank_t mds = s->mds_num;
656 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
657
658 int trimmed = 0;
659 list<Dentry*> skipped;
660 while (lru.lru_get_size() > 0) {
661 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
662 if (!dn)
663 break;
664
665 if ((dn->inode && dn->inode->caps.count(mds)) ||
666 dn->dir->parent_inode->caps.count(mds)) {
667 trim_dentry(dn);
668 trimmed++;
669 } else
670 skipped.push_back(dn);
671 }
672
673 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
674 lru.lru_insert_mid(*p);
675
676 ldout(cct, 20) << __func__ << " mds." << mds
677 << " trimmed " << trimmed << " dentries" << dendl;
678
679 if (s->caps.size() > 0)
680 _invalidate_kernel_dcache();
681 }
682
683 void Client::trim_dentry(Dentry *dn)
684 {
685 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
686 << " in dir "
687 << std::hex << dn->dir->parent_inode->ino << std::dec
688 << dendl;
689 if (dn->inode) {
690 Inode *diri = dn->dir->parent_inode;
691 diri->dir_release_count++;
692 clear_dir_complete_and_ordered(diri, true);
693 }
694 unlink(dn, false, false); // drop dir, drop dentry
695 }
696
697
// Apply mds-reported size/truncation state to a cached inode.
// truncate_seq ordering decides who wins: a strictly newer seq (or the
// same seq with a larger size) replaces our size, drops cached file
// data past the new truncation point, and trims cached inline data.
// 'issued' (the caps we hold) is currently unused here.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change without the seq advancing; only files
  // track it (warn otherwise)
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
739
740 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
741 utime_t ctime, utime_t mtime, utime_t atime)
742 {
743 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
744 << " ctime " << ctime << " mtime " << mtime << dendl;
745
746 if (time_warp_seq > in->time_warp_seq)
747 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
748 << " is higher than local time_warp_seq "
749 << in->time_warp_seq << dendl;
750
751 int warn = false;
752 // be careful with size, mtime, atime
753 if (issued & (CEPH_CAP_FILE_EXCL|
754 CEPH_CAP_FILE_WR|
755 CEPH_CAP_FILE_BUFFER|
756 CEPH_CAP_AUTH_EXCL|
757 CEPH_CAP_XATTR_EXCL)) {
758 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
759 if (ctime > in->ctime)
760 in->ctime = ctime;
761 if (time_warp_seq > in->time_warp_seq) {
762 //the mds updated times, so take those!
763 in->mtime = mtime;
764 in->atime = atime;
765 in->time_warp_seq = time_warp_seq;
766 } else if (time_warp_seq == in->time_warp_seq) {
767 //take max times
768 if (mtime > in->mtime)
769 in->mtime = mtime;
770 if (atime > in->atime)
771 in->atime = atime;
772 } else if (issued & CEPH_CAP_FILE_EXCL) {
773 //ignore mds values as we have a higher seq
774 } else warn = true;
775 } else {
776 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
777 if (time_warp_seq >= in->time_warp_seq) {
778 in->ctime = ctime;
779 in->mtime = mtime;
780 in->atime = atime;
781 in->time_warp_seq = time_warp_seq;
782 } else warn = true;
783 }
784 if (warn) {
785 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
786 << time_warp_seq << " is lower than local time_warp_seq "
787 << in->time_warp_seq
788 << dendl;
789 }
790 }
791
792 void Client::_fragmap_remove_non_leaves(Inode *in)
793 {
794 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
795 if (!in->dirfragtree.is_leaf(p->first))
796 in->fragmap.erase(p++);
797 else
798 ++p;
799 }
800
801 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
802 {
803 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
804 if (p->second == mds)
805 in->fragmap.erase(p++);
806 else
807 ++p;
808 }
809
/*
 * Create or refresh the cached Inode for an InodeStat from an MDS
 * reply.  Fields are taken from the mds only when the reply is
 * strictly newer than our cached version, or when we do not hold caps
 * that make our local copy authoritative.  Also registers/updates the
 * cap granted with the reply.  Returns the (possibly newly created)
 * inode.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode we ever cache becomes the mount root (and cwd)
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // not yet mounted: record this inode as a parent of the current
      // root ancestor and make it the new ancestor
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we already hold (including dirty ones); anything in the reply
  // beyond that is newly issued
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  // auth metadata: trust the mds unless we hold AUTH_EXCL ourselves
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  // xattrs: take the mds copy unless we hold XATTR_EXCL with a
  // non-zero local version
  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inode: no live cap, just accumulate the granted bits
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
965
966
967 /*
968 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
969 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * dir:        parent directory's Dir object
 * dname:      dentry name within dir
 * dlease:     mds-supplied dentry lease, applied via update_dentry_lease()
 * in:         inode the dentry should point at
 * old_dentry: optional previous dentry for this inode that should no
 *             longer point at it; it is unlinked first
 *
 * Returns the (existing or newly linked) Dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // dentry already points at the right inode; just refresh LRU
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale dentry pointing at a different inode: detach it, keeping
      // the Dentry object for relinking below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // pin 'in' so it cannot be freed while we relink it
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// leaving the old directory invalidates its ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1015
1016 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1017 {
1018 utime_t dttl = from;
1019 dttl += (float)dlease->duration_ms / 1000.0;
1020
1021 ceph_assert(dn);
1022
1023 if (dlease->mask & CEPH_LEASE_VALID) {
1024 if (dttl > dn->lease_ttl) {
1025 ldout(cct, 10) << "got dentry lease on " << dn->name
1026 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1027 dn->lease_ttl = dttl;
1028 dn->lease_mds = session->mds_num;
1029 dn->lease_seq = dlease->seq;
1030 dn->lease_gen = session->cap_gen;
1031 }
1032 }
1033 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1034 }
1035
1036
1037 /*
1038 * update MDS location cache for a single inode
1039 */
/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: remember (or forget, when auth < 0) which mds owns this frag
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  // the mds reported a frag that isn't a leaf of our cached fragtree:
  // force it to be one and drop now-invalid fragmap entries
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1057
1058 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1059 {
1060 if (diri->flags & I_COMPLETE) {
1061 if (complete) {
1062 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1063 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1064 } else {
1065 if (diri->flags & I_DIR_ORDERED) {
1066 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1067 diri->flags &= ~I_DIR_ORDERED;
1068 }
1069 }
1070 if (diri->dir)
1071 diri->dir->readdir_cache.clear();
1072 }
1073 }
1074
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra buffer (DirStat, count, flags, then numdn
 * (dname, LeaseStat, InodeStat) triples), links/updates a Dentry+Inode for
 * each entry, assigns readdir offsets, and fills both the per-dirp result
 * buffer and (when still valid) the shared per-Dir readdir cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // with CEPHFS_FEATURE_REPLY_ENCODING the payload is versioned-encoded,
  // signalled by the all-ones feature mask; otherwise fall back to the
  // connection's negotiated feature bits
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?  lssnap results are cached under the .snap pseudo-directory
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // the frag we asked for, and where within it this chunk resumes
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the start of a frag (0/1 are . and ..), so an empty
    // resume name implies we are at the beginning
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have answered for a different (e.g. refragmented) frag
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // starting a fresh scan from the very beginning of the directory:
    // snapshot the dir's generation counters so we can later tell whether
    // the readdir cache we build stays valid
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        // in hash order, the per-name counter restarts whenever the hash
        // value changes
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only while our generation snapshot still
      // matches the directory (i.e. no intervening modification)
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where the next chunk resumes
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1236
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * The trace optionally carries (dir inode, dir stat, dentry name, dentry
 * lease) followed by the target inode's stat.  Updates or creates the
 * corresponding cached Inode/Dentry objects and returns the target Inode
 * (NULL for traceless or already-unsafe replies).
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already updated the cache; the safe reply's
    // trace must be empty
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot tell what changed, so conservatively
    // invalidate the affected directory's completeness
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  // all-ones means "versioned encoding" (CEPHFS_FEATURE_REPLY_ENCODING);
  // otherwise decode against the connection's negotiated features
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
           << " is_target=" << (int)reply->head.is_target
           << " is_dentry=" << (int)reply->head.is_dentry
           << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // trace layout: [dir inode, dir stat, dentry name, dentry lease]? [target inode]?
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // sanity check: if we asked for xattrs the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry.  Drop any stale
      // positive dentry and (if the MDS granted a lease) cache the
      // negative one.
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1412
1413 // -------
1414
/*
 * Pick the MDS rank to send a request to.  Preference order: an explicitly
 * requested resend_mds, then the dirfrag map / caps of the request's inode
 * (or dentry's inode/parent), then a random active MDS.  If the choice came
 * from a dirfrag hash, *phash_diri is set to the directory inode used.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit target (forwarded or retried request) wins
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component within the request's base inode
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
               << " on " << req->path[0]
               << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash its name within the parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
               << " on " << de->name
               << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes aren't authoritatively placed; walk up to the nearest
    // non-snap ancestor and target its MDS instead
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
             << " hash=" << hash << dendl;

    // map hash -> dirfrag -> mds using the cached fragment map
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // fall back to the MDS we hold caps from (auth cap preferred when the
    // request must go to the auth MDS)
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1517
1518
1519 void Client::connect_mds_targets(mds_rank_t mds)
1520 {
1521 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1522 ceph_assert(mds_sessions.count(mds));
1523 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1524 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1525 q != info.export_targets.end();
1526 ++q) {
1527 if (mds_sessions.count(*q) == 0 &&
1528 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1529 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1530 << " export target mds." << *q << dendl;
1531 _open_mds_session(*q);
1532 }
1533 }
1534 }
1535
1536 void Client::dump_mds_sessions(Formatter *f)
1537 {
1538 f->dump_int("id", get_nodeid().v);
1539 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1540 f->dump_object("inst", inst);
1541 f->dump_stream("inst_str") << inst;
1542 f->dump_stream("addr_str") << inst.addr;
1543 f->open_array_section("sessions");
1544 for (const auto &p : mds_sessions) {
1545 f->open_object_section("session");
1546 p.second.dump(f);
1547 f->close_section();
1548 }
1549 f->close_section();
1550 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1551 }
1552 void Client::dump_mds_requests(Formatter *f)
1553 {
1554 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1555 p != mds_requests.end();
1556 ++p) {
1557 f->open_object_section("request");
1558 p->second->dump(f);
1559 f->close_section();
1560 }
1561 }
1562
/*
 * After a reply, make sure *ptarget points at the inode the caller asked
 * to create/operate on, and set *pcreated if this request performed the
 * create.  For traceless replies the target is recovered with a follow-up
 * lookup (by name) or getattr; if the recovered inode does not match the
 * created ino reported by the MDS, -EINTR is returned.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: insert_trace already resolved the target
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // traceless reply: try the created ino in our inode map first
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        // no dentry either; refresh the base inode's attributes instead
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1647
1648
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * NOTE(review): appears to require client_lock to be held by the caller
 * (the condvar wait below adopts it) — confirm against call sites.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests are excluded from oldest-tid tracking
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, wait; loop again on
  // forward/kick, exit on reply or abort
  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the chosen rank no longer exists; drop the stale mapping and retry
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
        request->abort(-EPERM);
        break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // temporarily adopt client_lock into a unique_lock so the condvar can
    // release/reacquire it; release() afterwards so we keep holding it
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||           // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply arrived
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1813
1814 void Client::unregister_request(MetaRequest *req)
1815 {
1816 mds_requests.erase(req->tid);
1817 if (req->tid == oldest_tid) {
1818 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1819 while (true) {
1820 if (p == mds_requests.end()) {
1821 oldest_tid = 0;
1822 break;
1823 }
1824 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1825 oldest_tid = p->first;
1826 break;
1827 }
1828 ++p;
1829 }
1830 }
1831 put_request(req);
1832 }
1833
1834 void Client::put_request(MetaRequest *request)
1835 {
1836 if (request->_put()) {
1837 int op = -1;
1838 if (request->success)
1839 op = request->get_op();
1840 InodeRef other_in;
1841 request->take_other_inode(&other_in);
1842 delete request;
1843
1844 if (other_in &&
1845 (op == CEPH_MDS_OP_RMDIR ||
1846 op == CEPH_MDS_OP_RENAME ||
1847 op == CEPH_MDS_OP_RMSNAP)) {
1848 _try_to_trim_inode(other_in.get(), false);
1849 }
1850 }
1851 }
1852
/*
 * Possibly append a cap release for this inode to req->cap_releases.
 *
 * Drops the `drop` cap bits held from `mds` (unless any `unless` bit is
 * issued, or the bits are dirty/in use); with `force`, a release record is
 * appended even when no bits are actually dropped.  Returns nonzero iff a
 * release record was appended.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop bits that are dirty or currently in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // build the wire-format release record from the (possibly updated) cap
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
1898
1899 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1900 mds_rank_t mds, int drop, int unless)
1901 {
1902 ldout(cct, 20) << __func__ << " enter(dn:"
1903 << dn << ")" << dendl;
1904 int released = 0;
1905 if (dn->dir)
1906 released = encode_inode_release(dn->dir->parent_inode, req,
1907 mds, drop, unless, 1);
1908 if (released && dn->lease_mds == mds) {
1909 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1910 auto& rel = req->cap_releases.back();
1911 rel.item.dname_len = dn->name.length();
1912 rel.item.dname_seq = dn->lease_seq;
1913 rel.dname = dn->name;
1914 }
1915 ldout(cct, 25) << __func__ << " exit(dn:"
1916 << dn << ")" << dendl;
1917 }
1918
1919
1920 /*
1921 * This requires the MClientRequest *request member to be set.
1922 * It will error out horribly without one.
1923 * Additionally, if you set any *drop member, you'd better have
1924 * set the corresponding dentry!
1925 */
1926 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1927 {
1928 ldout(cct, 20) << __func__ << " enter (req: "
1929 << req << ", mds: " << mds << ")" << dendl;
1930 if (req->inode_drop && req->inode())
1931 encode_inode_release(req->inode(), req,
1932 mds, req->inode_drop,
1933 req->inode_unless);
1934
1935 if (req->old_inode_drop && req->old_inode())
1936 encode_inode_release(req->old_inode(), req,
1937 mds, req->old_inode_drop,
1938 req->old_inode_unless);
1939 if (req->other_inode_drop && req->other_inode())
1940 encode_inode_release(req->other_inode(), req,
1941 mds, req->other_inode_drop,
1942 req->other_inode_unless);
1943
1944 if (req->dentry_drop && req->dentry())
1945 encode_dentry_release(req->dentry(), req,
1946 mds, req->dentry_drop,
1947 req->dentry_unless);
1948
1949 if (req->old_dentry_drop && req->old_dentry())
1950 encode_dentry_release(req->old_dentry(), req,
1951 mds, req->old_dentry_drop,
1952 req->old_dentry_unless);
1953 ldout(cct, 25) << __func__ << " exit (req: "
1954 << req << ", mds " << mds <<dendl;
1955 }
1956
1957 bool Client::have_open_session(mds_rank_t mds)
1958 {
1959 const auto &it = mds_sessions.find(mds);
1960 return it != mds_sessions.end() &&
1961 (it->second.state == MetaSession::STATE_OPEN ||
1962 it->second.state == MetaSession::STATE_STALE);
1963 }
1964
1965 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1966 {
1967 const auto &it = mds_sessions.find(mds);
1968 if (it == mds_sessions.end() || it->second.con != con) {
1969 return NULL;
1970 } else {
1971 return &it->second;
1972 }
1973 }
1974
1975 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1976 {
1977 auto it = mds_sessions.find(mds);
1978 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1979 }
1980
1981 /**
1982 * Populate a map of strings with client-identifying metadata,
1983 * such as the hostname. Call this once at initialization.
1984 */
1985 void Client::populate_metadata(const std::string &mount_root)
1986 {
1987 // Hostname
1988 struct utsname u;
1989 int r = uname(&u);
1990 if (r >= 0) {
1991 metadata["hostname"] = u.nodename;
1992 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1993 } else {
1994 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1995 }
1996
1997 metadata["pid"] = stringify(getpid());
1998
1999 // Ceph entity id (the '0' in "client.0")
2000 metadata["entity_id"] = cct->_conf->name.get_id();
2001
2002 // Our mount position
2003 if (!mount_root.empty()) {
2004 metadata["root"] = mount_root;
2005 }
2006
2007 // Ceph version
2008 metadata["ceph_version"] = pretty_version_to_str();
2009 metadata["ceph_sha1"] = git_version_to_str();
2010
2011 // Apply any metadata from the user's configured overrides
2012 std::vector<std::string> tokens;
2013 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2014 for (const auto &i : tokens) {
2015 auto eqpos = i.find("=");
2016 // Throw out anything that isn't of the form "<str>=<str>"
2017 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2018 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2019 continue;
2020 }
2021 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2022 }
2023 }
2024
2025 /**
2026 * Optionally add or override client metadata fields.
2027 */
2028 void Client::update_metadata(std::string const &k, std::string const &v)
2029 {
2030 std::lock_guard l(client_lock);
2031 ceph_assert(initialized);
2032
2033 auto it = metadata.find(k);
2034 if (it != metadata.end()) {
2035 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2036 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2037 }
2038
2039 metadata[k] = v;
2040 }
2041
/*
 * Create a new MetaSession for mds (must not already exist), connect to it
 * and send the session-open request.  Returns the new session, whose state
 * remains OPENING until the MDS replies.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // construct the MetaSession in place in the map (piecewise so the
  // session is never copied/moved)
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // send our metadata and supported-feature bits with the open request
  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2058
2059 void Client::_close_mds_session(MetaSession *s)
2060 {
2061 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2062 s->state = MetaSession::STATE_CLOSING;
2063 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2064 }
2065
/*
 * Tear down a session after it has closed (or been rejected): mark the
 * state, drop the connection, wake any waiters, release the session's caps
 * (failing them with `err`) and kick its pending requests.  REJECTED
 * sessions are kept in the map (so the rejection is remembered); CLOSED
 * ones are erased.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // a rejection that raced with our own close request still counts as closed
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // note: erasing invalidates s, so this must be the last use of it
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2082
// Dispatch an incoming MClientSession message for an existing session.
// Implements the client side of the session state machine: open/close
// acknowledgements, cap renewal, staleness, cap recall, message flush,
// force-read-only and rejection.  Messages from an MDS we have no
// session with are logged and dropped.
2083 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2084 {
2085 mds_rank_t from = mds_rank_t(m->get_source().num());
2086 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2087
2088 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2089 if (!session) {
2090 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2091 return;
2092 }
2093
2094 switch (m->get_op()) {
// MDS accepted our open: verify it supports every feature we require
// before moving to STATE_OPEN; otherwise close the session immediately.
2095 case CEPH_SESSION_OPEN:
2096 {
2097 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2098 missing_features -= m->supported_features;
2099 if (!missing_features.empty()) {
2100 lderr(cct) << "mds." << from << " lacks required features '"
2101 << missing_features << "', closing session " << dendl;
2102 _close_mds_session(session);
2103 _closed_mds_session(session, -EPERM, true);
2104 break;
2105 }
2106 session->mds_features = std::move(m->supported_features);
2107
2108 renew_caps(session);
2109 session->state = MetaSession::STATE_OPEN;
2110 if (unmounting)
2111 mount_cond.notify_all();
2112 else
2113 connect_mds_targets(from);
2114 signal_context_list(session->waiting_for_open);
2115 break;
2116 }
2117
2118 case CEPH_SESSION_CLOSE:
2119 _closed_mds_session(session);
2120 break;
2121
// Renewal ack: extend cap_ttl only if it answers our latest renewal
// request; if the caps had already gone stale in the meantime, wake
// waiters now that they are valid again.
2122 case CEPH_SESSION_RENEWCAPS:
2123 if (session->cap_renew_seq == m->get_seq()) {
2124 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2125 session->cap_ttl =
2126 session->last_cap_renew_request + mdsmap->get_session_timeout();
2127 if (was_stale)
2128 wake_up_session_caps(session, false);
2129 }
2130 break;
2131
2132 case CEPH_SESSION_STALE:
2133 // invalidate session caps/leases
2134 session->cap_gen++;
2135 session->cap_ttl = ceph_clock_now();
2136 session->cap_ttl -= 1;
2137 renew_caps(session);
2138 break;
2139
2140 case CEPH_SESSION_RECALL_STATE:
2141 trim_caps(session, m->get_max_caps());
2142 break;
2143
2144 case CEPH_SESSION_FLUSHMSG:
2145 /* flush cap release */
// NOTE: this inner 'm' intentionally shadows the message ref; it drains
// the session's pending cap-release message (if any) before we ack.
2146 if (auto& m = session->release; m) {
2147 session->con->send_message2(std::move(m));
2148 }
2149 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2150 break;
2151
2152 case CEPH_SESSION_FORCE_RO:
2153 force_session_readonly(session);
2154 break;
2155
// MDS refused the session; report its error_string (when provided) and
// tear down locally, keeping the session visible in STATE_REJECTED.
2156 case CEPH_SESSION_REJECT:
2157 {
2158 std::string_view error_str;
2159 auto it = m->metadata.find("error_string");
2160 if (it != m->metadata.end())
2161 error_str = it->second;
2162 else
2163 error_str = "unknown error";
2164 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2165
2166 _closed_mds_session(session, -EPERM, true);
2167 }
2168 break;
2169
2170 default:
2171 ceph_abort();
2172 }
2173 }
2174
2175 bool Client::_any_stale_sessions() const
2176 {
2177 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2178
2179 for (const auto &p : mds_sessions) {
2180 if (p.second.state == MetaSession::STATE_STALE) {
2181 return true;
2182 }
2183 }
2184
2185 return false;
2186 }
2187
2188 void Client::_kick_stale_sessions()
2189 {
2190 ldout(cct, 1) << __func__ << dendl;
2191
2192 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2193 MetaSession &s = it->second;
2194 if (s.state == MetaSession::STATE_REJECTED) {
2195 mds_sessions.erase(it++);
2196 continue;
2197 }
2198 ++it;
2199 if (s.state == MetaSession::STATE_STALE)
2200 _closed_mds_session(&s);
2201 }
2202 }
2203
// (Re)build the wire message for 'request' and send it to 'session'.
// Replayed ops (ones that already got an unsafe reply) are flagged so the
// MDS can match them against its journal; fresh ops carry our pending cap
// releases unless drop_cap_releases is set (used while cap reconnect has
// not been sent yet).  Also records which mds/mseq the request went to,
// which later ESTALE handling relies on.
2204 void Client::send_request(MetaRequest *request, MetaSession *session,
2205 bool drop_cap_releases)
2206 {
2207 // make the request
2208 mds_rank_t mds = session->mds_num;
2209 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2210 << " for mds." << mds << dendl;
2211 auto r = build_client_request(request);
2212 if (request->dentry()) {
2213 r->set_dentry_wanted();
2214 }
2215 if (request->got_unsafe) {
2216 r->set_replayed_op();
2217 if (request->target)
2218 r->head.ino = request->target->ino;
2219 } else {
2220 encode_cap_releases(request, mds);
2221 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2222 request->cap_releases.clear();
2223 else
2224 r->releases.swap(request->cap_releases);
2225 }
2226 r->set_mdsmap_epoch(mdsmap->get_epoch());
// setxattr can change the file layout, so the MDS needs to know which
// osdmap epoch the client based the layout on.
2227 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2228 objecter->with_osdmap([r](const OSDMap& o) {
2229 r->set_osdmap_epoch(o.get_epoch());
2230 });
2231 }
2232
// Only stamp the first transmission; retries keep the original send time.
2233 if (request->mds == -1) {
2234 request->sent_stamp = ceph_clock_now();
2235 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2236 }
2237 request->mds = mds;
2238
// Remember the migration seq of the cap we hold from this MDS so an
// ESTALE reply can tell whether the cap has since migrated.
2239 Inode *in = request->inode();
2240 if (in) {
2241 auto it = in->caps.find(mds);
2242 if (it != in->caps.end()) {
2243 request->sent_on_mseq = it->second.mseq;
2244 }
2245 }
2246
2247 session->requests.push_back(&request->item);
2248
2249 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2250 session->con->send_message2(std::move(r));
2251 }
2252
// Translate a MetaRequest into an MClientRequest wire message: copy the
// raw request head, fill in the filepath from the request's inode or
// dentry when the caller did not set one, attach data/gids, and bump the
// retry counter (the message carries the pre-increment attempt number).
2253 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2254 {
2255 auto req = make_message<MClientRequest>(request->get_op());
2256 req->set_tid(request->tid);
2257 req->set_stamp(request->op_stamp);
2258 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2259
2260 // if the filepath's haven't been set, set them!
// Fallback order: inode path, then the dentry's inode, then the dentry's
// parent dir plus the dentry name; otherwise warn and send an empty path.
2261 if (request->path.empty()) {
2262 Inode *in = request->inode();
2263 Dentry *de = request->dentry();
2264 if (in)
2265 in->make_nosnap_relative_path(request->path);
2266 else if (de) {
2267 if (de->inode)
2268 de->inode->make_nosnap_relative_path(request->path);
2269 else if (de->dir) {
2270 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2271 request->path.push_dentry(de->name);
2272 }
2273 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2274 << " No path, inode, or appropriately-endowed dentry given!"
2275 << dendl;
2276 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2277 << " No path, inode, or dentry given!"
2278 << dendl;
2279 }
2280 req->set_filepath(request->get_filepath());
2281 req->set_filepath2(request->get_filepath2());
2282 req->set_data(request->data);
2283 req->set_retry_attempt(request->retry_attempt++);
2284 req->head.num_fwd = request->num_fwd;
2285 const gid_t *_gids;
2286 int gid_count = request->perms.get_gids(&_gids);
2287 req->set_gid_list(gid_count, _gids);
2288 return req;
2289 }
2290
2291
2292
2293 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2294 {
2295 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2296 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2297 if (!session) {
2298 return;
2299 }
2300 ceph_tid_t tid = fwd->get_tid();
2301
2302 if (mds_requests.count(tid) == 0) {
2303 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2304 return;
2305 }
2306
2307 MetaRequest *request = mds_requests[tid];
2308 ceph_assert(request);
2309
2310 // reset retry counter
2311 request->retry_attempt = 0;
2312
2313 // request not forwarded, or dest mds has no session.
2314 // resend.
2315 ldout(cct, 10) << __func__ << " tid " << tid
2316 << " fwd " << fwd->get_num_fwd()
2317 << " to mds." << fwd->get_dest_mds()
2318 << ", resending to " << fwd->get_dest_mds()
2319 << dendl;
2320
2321 request->mds = -1;
2322 request->item.remove_myself();
2323 request->num_fwd = fwd->get_num_fwd();
2324 request->resend_mds = fwd->get_dest_mds();
2325 request->caller_cond->notify_all();
2326 }
2327
2328 bool Client::is_dir_operation(MetaRequest *req)
2329 {
2330 int op = req->get_op();
2331 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2332 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2333 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2334 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2335 return true;
2336 return false;
2337 }
2338
2339 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2340 {
2341 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2342 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2343 if (!session) {
2344 return;
2345 }
2346
2347 ceph_tid_t tid = reply->get_tid();
2348 bool is_safe = reply->is_safe();
2349
2350 if (mds_requests.count(tid) == 0) {
2351 lderr(cct) << __func__ << " no pending request on tid " << tid
2352 << " safe is:" << is_safe << dendl;
2353 return;
2354 }
2355 MetaRequest *request = mds_requests.at(tid);
2356
2357 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2358 << " tid " << tid << dendl;
2359
2360 if (request->got_unsafe && !is_safe) {
2361 //duplicate response
2362 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2363 << mds_num << " safe:" << is_safe << dendl;
2364 return;
2365 }
2366
2367 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2368 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2369 << " from mds." << request->mds << dendl;
2370 request->send_to_auth = true;
2371 request->resend_mds = choose_target_mds(request);
2372 Inode *in = request->inode();
2373 std::map<mds_rank_t, Cap>::const_iterator it;
2374 if (request->resend_mds >= 0 &&
2375 request->resend_mds == request->mds &&
2376 (in == NULL ||
2377 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2378 request->sent_on_mseq == it->second.mseq)) {
2379 ldout(cct, 20) << "have to return ESTALE" << dendl;
2380 } else {
2381 request->caller_cond->notify_all();
2382 return;
2383 }
2384 }
2385
2386 ceph_assert(!request->reply);
2387 request->reply = reply;
2388 insert_trace(request, session);
2389
2390 // Handle unsafe reply
2391 if (!is_safe) {
2392 request->got_unsafe = true;
2393 session->unsafe_requests.push_back(&request->unsafe_item);
2394 if (is_dir_operation(request)) {
2395 Inode *dir = request->inode();
2396 ceph_assert(dir);
2397 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2398 }
2399 if (request->target) {
2400 InodeRef &in = request->target;
2401 in->unsafe_ops.push_back(&request->unsafe_target_item);
2402 }
2403 }
2404
2405 // Only signal the caller once (on the first reply):
2406 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2407 if (!is_safe || !request->got_unsafe) {
2408 ceph::condition_variable cond;
2409 request->dispatch_cond = &cond;
2410
2411 // wake up waiter
2412 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2413 request->caller_cond->notify_all();
2414
2415 // wake for kick back
2416 std::unique_lock l{client_lock, std::adopt_lock};
2417 cond.wait(l, [tid, request, &cond, this] {
2418 if (request->dispatch_cond) {
2419 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2420 << tid << " " << &cond << dendl;
2421 }
2422 return !request->dispatch_cond;
2423 });
2424 l.release();
2425 }
2426
2427 if (is_safe) {
2428 // the filesystem change is committed to disk
2429 // we're done, clean up
2430 if (request->got_unsafe) {
2431 request->unsafe_item.remove_myself();
2432 request->unsafe_dir_item.remove_myself();
2433 request->unsafe_target_item.remove_myself();
2434 signal_cond_list(request->waitfor_safe);
2435 }
2436 request->item.remove_myself();
2437 unregister_request(request);
2438 }
2439 if (unmounting)
2440 mount_cond.notify_all();
2441 }
2442
2443 void Client::_handle_full_flag(int64_t pool)
2444 {
2445 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2446 << "on " << pool << dendl;
2447 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2448 // to do this rather than blocking, because otherwise when we fill up we
2449 // potentially lock caps forever on files with dirty pages, and we need
2450 // to be able to release those caps to the MDS so that it can delete files
2451 // and free up space.
2452 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2453
2454 // For all inodes with layouts in this pool and a pending flush write op
2455 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2456 // from ObjectCacher so that it doesn't re-issue the write in response to
2457 // the ENOSPC error.
2458 // Fortunately since we're cancelling everything in a given pool, we don't
2459 // need to know which ops belong to which ObjectSet, we can just blow all
2460 // the un-flushed cached data away and mark any dirty inodes' async_err
2461 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2462 // affecting this pool, and all the objectsets we're purging were also
2463 // in this pool.
2464 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2465 i != inode_map.end(); ++i)
2466 {
2467 Inode *inode = i->second;
2468 if (inode->oset.dirty_or_tx
2469 && (pool == -1 || inode->layout.pool_id == pool)) {
2470 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2471 << " has dirty objects, purging and setting ENOSPC" << dendl;
2472 objectcacher->purge_set(&inode->oset);
2473 inode->set_async_err(-ENOSPC);
2474 }
2475 }
2476
2477 if (cancelled_epoch != (epoch_t)-1) {
2478 set_cap_epoch_barrier(cancelled_epoch);
2479 }
2480 }
2481
// Process a new OSDMap: detect whether this client was just blacklisted
// (entries are TYPE_ANY from nautilus on, TYPE_LEGACY before), abort MDS
// sessions and cancel in-flight writes if so, clear the flag once the map
// no longer lists us, and propagate global/per-pool FULL flags by
// cancelling outstanding writes against the affected pools.
2482 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2483 {
2484 std::set<entity_addr_t> new_blacklists;
2485 objecter->consume_blacklist_events(&new_blacklists);
2486
2487 const auto myaddrs = messenger->get_myaddrs();
2488 bool new_blacklist = false;
2489 bool prenautilus = objecter->with_osdmap(
2490 [&](const OSDMap& o) {
2491 return o.require_osd_release < ceph_release_t::nautilus;
2492 });
// Compare each of our addresses against the new blacklist entries under
// both address-type conventions.
2493 if (!blacklisted) {
2494 for (auto a : myaddrs.v) {
2495 // blacklist entries are always TYPE_ANY for nautilus+
2496 a.set_type(entity_addr_t::TYPE_ANY);
2497 if (new_blacklists.count(a)) {
2498 new_blacklist = true;
2499 break;
2500 }
2501 if (prenautilus) {
2502 // ...except pre-nautilus, they were TYPE_LEGACY
2503 a.set_type(entity_addr_t::TYPE_LEGACY);
2504 if (new_blacklists.count(a)) {
2505 new_blacklist = true;
2506 break;
2507 }
2508 }
2509 }
2510 }
2511 if (new_blacklist) {
2512 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2513 return o.get_epoch();
2514 });
2515 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2516 blacklisted = true;
2517
2518 _abort_mds_sessions(-EBLACKLISTED);
2519
2520 // Since we know all our OSD ops will fail, cancel them all preemtively,
2521 // so that on an unhealthy cluster we can umount promptly even if e.g.
2522 // some PGs were inaccessible.
2523 objecter->op_cancel_writes(-EBLACKLISTED);
2524
2525 } else if (blacklisted) {
2526 // Handle case where we were blacklisted but no longer are
2527 blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2528 return o.is_blacklisted(myaddrs);});
2529 }
2530
2531 // Always subscribe to next osdmap for blacklisted client
2532 // until this client is not blacklisted.
2533 if (blacklisted) {
2534 objecter->maybe_request_map();
2535 }
2536
2537 if (objecter->osdmap_full_flag()) {
2538 _handle_full_flag(-1);
2539 } else {
2540 // Accumulate local list of full pools so that I can drop
2541 // the objecter lock before re-entering objecter in
2542 // cancel_writes
2543 std::vector<int64_t> full_pools;
2544
2545 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2546 for (const auto& kv : o.get_pools()) {
2547 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2548 full_pools.push_back(kv.first);
2549 }
2550 }
2551 });
2552
2553 for (auto p : full_pools)
2554 _handle_full_flag(p);
2555
2556 // Subscribe to subsequent maps to watch for the full flag going
2557 // away. For the global full flag objecter does this for us, but
2558 // it pays no attention to the per-pool full flag so in this branch
2559 // we do it ourselves.
2560 if (!full_pools.empty()) {
2561 objecter->maybe_request_map();
2562 }
2563 }
2564 }
2565
2566
2567 // ------------------------
2568 // incoming messages
2569
2570
// Central messenger dispatcher: routes each incoming message to its
// handler under client_lock.  Returns false for message types we do not
// consume so another dispatcher may claim them; messages arriving before
// initialization are silently discarded (returning true).  While
// unmounting, every dispatched message is followed by a cache-trim pass
// that pokes unmount() when progress was made.
2571 bool Client::ms_dispatch2(const MessageRef &m)
2572 {
2573 std::lock_guard l(client_lock);
2574 if (!initialized) {
2575 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2576 return true;
2577 }
2578
2579 switch (m->get_type()) {
2580 // mounting and mds sessions
2581 case CEPH_MSG_MDS_MAP:
2582 handle_mds_map(ref_cast<MMDSMap>(m));
2583 break;
2584 case CEPH_MSG_FS_MAP:
2585 handle_fs_map(ref_cast<MFSMap>(m));
2586 break;
2587 case CEPH_MSG_FS_MAP_USER:
2588 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2589 break;
2590 case CEPH_MSG_CLIENT_SESSION:
2591 handle_client_session(ref_cast<MClientSession>(m));
2592 break;
2593
2594 case CEPH_MSG_OSD_MAP:
2595 handle_osd_map(ref_cast<MOSDMap>(m));
2596 break;
2597
2598 // requests
2599 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2600 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2601 break;
2602 case CEPH_MSG_CLIENT_REPLY:
2603 handle_client_reply(ref_cast<MClientReply>(m));
2604 break;
2605
2606 // reclaim reply
2607 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2608 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2609 break;
2610
2611 case CEPH_MSG_CLIENT_SNAP:
2612 handle_snap(ref_cast<MClientSnap>(m));
2613 break;
2614 case CEPH_MSG_CLIENT_CAPS:
2615 handle_caps(ref_cast<MClientCaps>(m));
2616 break;
2617 case CEPH_MSG_CLIENT_LEASE:
2618 handle_lease(ref_cast<MClientLease>(m));
2619 break;
// Command replies are only ours when they come from an MDS; otherwise
// let another dispatcher (e.g. the objecter) take the message.
2620 case MSG_COMMAND_REPLY:
2621 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2622 handle_command_reply(ref_cast<MCommandReply>(m));
2623 } else {
2624 return false;
2625 }
2626 break;
2627 case CEPH_MSG_CLIENT_QUOTA:
2628 handle_quota(ref_cast<MClientQuota>(m));
2629 break;
2630
2631 default:
2632 return false;
2633 }
2634
2635 // unmounting?
2636 if (unmounting) {
2637 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2638 << "+" << inode_map.size() << dendl;
2639 long unsigned size = lru.lru_get_size() + inode_map.size();
2640 trim_cache();
// NOTE(review): trim_cache() should only ever shrink the cache, so
// 'size < new-size' looks inverted (the "cache shrank" branch appears
// unreachable).  This matches the upstream import, but confirm against
// current upstream before relying on the wake-up here.
2641 if (size < lru.lru_get_size() + inode_map.size()) {
2642 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2643 mount_cond.notify_all();
2644 } else {
2645 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2646 << "+" << inode_map.size() << dendl;
2647 }
2648 }
2649
2650 return true;
2651 }
2652
2653 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2654 {
2655 fsmap.reset(new FSMap(m->get_fsmap()));
2656
2657 signal_cond_list(waiting_for_fsmap);
2658
2659 monclient->sub_got("fsmap", fsmap->get_epoch());
2660 }
2661
2662 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2663 {
2664 fsmap_user.reset(new FSMapUser);
2665 *fsmap_user = m->get_fsmap();
2666
2667 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2668 signal_cond_list(waiting_for_fsmap);
2669 }
2670
// Install a newer MDSMap (older/equal epochs are ignored), cancel admin
// commands aimed at MDS daemons that vanished or went laggy, then walk
// every session comparing old vs new per-rank state: mark connections
// down, resend a reconnect when an MDS enters RECONNECT, kick requests
// and caps when it becomes ACTIVE, and close sessions for removed ranks.
2671 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2672 {
2673 mds_gid_t old_inc, new_inc;
2674 if (m->get_epoch() <= mdsmap->get_epoch()) {
2675 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2676 << " is identical to or older than our "
2677 << mdsmap->get_epoch() << dendl;
2678 return;
2679 }
2680
2681 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2682
// Keep the previous map in 'oldmap' so per-rank state transitions can be
// compared below.
2683 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2684 oldmap.swap(mdsmap);
2685
2686 mdsmap->decode(m->get_encoded());
2687
2688 // Cancel any commands for missing or laggy GIDs
2689 std::list<ceph_tid_t> cancel_ops;
2690 auto &commands = command_table.get_commands();
2691 for (const auto &i : commands) {
2692 auto &op = i.second;
2693 const mds_gid_t op_mds_gid = op.mds_gid;
2694 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2695 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2696 cancel_ops.push_back(i.first);
2697 if (op.outs) {
2698 std::ostringstream ss;
2699 ss << "MDS " << op_mds_gid << " went away";
2700 *(op.outs) = ss.str();
2701 }
2702 op.con->mark_down();
2703 if (op.on_finish) {
2704 op.on_finish->complete(-ETIMEDOUT);
2705 }
2706 }
2707 }
2708
2709 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2710 i != cancel_ops.end(); ++i) {
2711 command_table.erase(*i);
2712 }
2713
2714 // reset session
// Iterator is advanced before the body because _closed_mds_session()
// may erase the current session from mds_sessions.
2715 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2716 mds_rank_t mds = p->first;
2717 MetaSession *session = &p->second;
2718 ++p;
2719
2720 int oldstate = oldmap->get_state(mds);
2721 int newstate = mdsmap->get_state(mds);
2722 if (!mdsmap->is_up(mds)) {
2723 session->con->mark_down();
2724 } else if (mdsmap->get_addrs(mds) != session->addrs) {
// Address changed: a new daemon (or restart) took over this rank.  An
// incarnation bump means all previous state is void, so treat the old
// state as NULL.
2725 old_inc = oldmap->get_incarnation(mds);
2726 new_inc = mdsmap->get_incarnation(mds);
2727 if (old_inc != new_inc) {
2728 ldout(cct, 1) << "mds incarnation changed from "
2729 << old_inc << " to " << new_inc << dendl;
2730 oldstate = MDSMap::STATE_NULL;
2731 }
2732 session->con->mark_down();
2733 session->addrs = mdsmap->get_addrs(mds);
2734 // When new MDS starts to take over, notify kernel to trim unused entries
2735 // in its dcache/icache. Hopefully, the kernel will release some unused
2736 // inodes before the new MDS enters reconnect state.
2737 trim_cache_for_reconnect(session);
2738 } else if (oldstate == newstate)
2739 continue; // no change
2740
2741 session->mds_state = newstate;
2742 if (newstate == MDSMap::STATE_RECONNECT) {
2743 session->con = messenger->connect_to_mds(session->addrs);
2744 send_reconnect(session);
2745 } else if (newstate > MDSMap::STATE_RECONNECT) {
// If the MDS skipped past RECONNECT while we were not looking, we missed
// our window to reconnect; the session is unrecoverable.
2746 if (oldstate < MDSMap::STATE_RECONNECT) {
2747 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2748 _closed_mds_session(session);
2749 continue;
2750 }
2751 if (newstate >= MDSMap::STATE_ACTIVE) {
2752 if (oldstate < MDSMap::STATE_ACTIVE) {
2753 // kick new requests
2754 kick_requests(session);
2755 kick_flushing_caps(session);
2756 signal_context_list(session->waiting_for_open);
2757 wake_up_session_caps(session, true);
2758 }
2759 connect_mds_targets(mds);
2760 }
2761 } else if (newstate == MDSMap::STATE_NULL &&
2762 mds >= mdsmap->get_max_mds()) {
2763 _closed_mds_session(session);
2764 }
2765 }
2766
2767 // kick any waiting threads
2768 signal_cond_list(waiting_for_mdsmap);
2769
2770 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2771 }
2772
// Rebuild session state on an MDS that entered RECONNECT: trim the cache,
// reset session/cap sequence numbers, resend unsafe and old requests,
// early-kick flushing caps, then describe every cap we hold from this MDS
// (path, wanted/issued bits, file locks, snaprealm) in one or more
// MClientReconnect messages.  Splitting across multiple messages is only
// done when the MDS advertises CEPHFS_FEATURE_MULTI_RECONNECT.
2773 void Client::send_reconnect(MetaSession *session)
2774 {
2775 mds_rank_t mds = session->mds_num;
2776 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2777
2778 // trim unused caps to reduce MDS's cache rejoin time
2779 trim_cache_for_reconnect(session);
2780
2781 session->readonly = false;
2782
2783 session->release.reset();
2784
2785 // reset my cap seq number
2786 session->seq = 0;
2787 //connect to the mds' offload targets
2788 connect_mds_targets(mds);
2789 //make sure unsafe requests get saved
2790 resend_unsafe_requests(session);
2791
2792 early_kick_flushing_caps(session);
2793
2794 auto m = make_message<MClientReconnect>();
2795 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2796
2797 // i have an open session.
2798 ceph::unordered_set<inodeno_t> did_snaprealm;
2799 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2800 p != inode_map.end();
2801 ++p) {
2802 Inode *in = p->second;
2803 auto it = in->caps.find(mds);
2804 if (it != in->caps.end()) {
// When multi-message reconnect is allowed, flush the current message
// before it grows past ~half of INT_MAX and start a fresh one.
2805 if (allow_multi &&
2806 m->get_approx_size() >=
2807 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2808 m->mark_more();
2809 session->con->send_message2(std::move(m));
2810
2811 m = make_message<MClientReconnect>();
2812 }
2813
2814 Cap &cap = it->second;
2815 ldout(cct, 10) << " caps on " << p->first
2816 << " " << ccap_string(cap.issued)
2817 << " wants " << ccap_string(in->caps_wanted())
2818 << dendl;
2819 filepath path;
2820 in->make_long_path(path);
2821 ldout(cct, 10) << " path " << path << dendl;
2822
2823 bufferlist flockbl;
2824 _encode_filelocks(in, flockbl);
2825
// The new MDS incarnation starts numbering from scratch.
2826 cap.seq = 0; // reset seq.
2827 cap.issue_seq = 0; // reset seq.
2828 cap.mseq = 0; // reset seq.
2829 // cap gen should catch up with session cap_gen
// A stale cap generation means the issued bits are no longer trustworthy;
// fall back to just CEPH_CAP_PIN.
2830 if (cap.gen < session->cap_gen) {
2831 cap.gen = session->cap_gen;
2832 cap.issued = cap.implemented = CEPH_CAP_PIN;
2833 } else {
2834 cap.issued = cap.implemented;
2835 }
2836 snapid_t snap_follows = 0;
2837 if (!in->cap_snaps.empty())
2838 snap_follows = in->cap_snaps.begin()->first;
2839
2840 m->add_cap(p->first.ino,
2841 cap.cap_id,
2842 path.get_ino(), path.get_path(), // ino
2843 in->caps_wanted(), // wanted
2844 cap.issued, // issued
2845 in->snaprealm->ino,
2846 snap_follows,
2847 flockbl);
2848
// Describe each snaprealm at most once per reconnect.
2849 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2850 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2851 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2852 did_snaprealm.insert(in->snaprealm->ino);
2853 }
2854 }
2855 }
2856
2857 if (!allow_multi)
2858 m->set_encoding_version(0); // use connection features to choose encoding
2859 session->con->send_message2(std::move(m));
2860
2861 mount_cond.notify_all();
2862
2863 if (session->reclaim_state == MetaSession::RECLAIMING)
2864 signal_cond_list(waiting_for_reclaim);
2865 }
2866
2867
2868 void Client::kick_requests(MetaSession *session)
2869 {
2870 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2871 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2872 p != mds_requests.end();
2873 ++p) {
2874 MetaRequest *req = p->second;
2875 if (req->got_unsafe)
2876 continue;
2877 if (req->aborted()) {
2878 if (req->caller_cond) {
2879 req->kick = true;
2880 req->caller_cond->notify_all();
2881 }
2882 continue;
2883 }
2884 if (req->retry_attempt > 0)
2885 continue; // new requests only
2886 if (req->mds == session->mds_num) {
2887 send_request(p->second, session);
2888 }
2889 }
2890 }
2891
2892 void Client::resend_unsafe_requests(MetaSession *session)
2893 {
2894 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2895 !iter.end();
2896 ++iter)
2897 send_request(*iter, session);
2898
2899 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2900 // process completed requests in clientreplay stage.
2901 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2902 p != mds_requests.end();
2903 ++p) {
2904 MetaRequest *req = p->second;
2905 if (req->got_unsafe)
2906 continue;
2907 if (req->aborted())
2908 continue;
2909 if (req->retry_attempt == 0)
2910 continue; // old requests only
2911 if (req->mds == session->mds_num)
2912 send_request(req, session, true);
2913 }
2914 }
2915
2916 void Client::wait_unsafe_requests()
2917 {
2918 list<MetaRequest*> last_unsafe_reqs;
2919 for (const auto &p : mds_sessions) {
2920 const MetaSession &s = p.second;
2921 if (!s.unsafe_requests.empty()) {
2922 MetaRequest *req = s.unsafe_requests.back();
2923 req->get();
2924 last_unsafe_reqs.push_back(req);
2925 }
2926 }
2927
2928 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2929 p != last_unsafe_reqs.end();
2930 ++p) {
2931 MetaRequest *req = *p;
2932 if (req->unsafe_item.is_on_list())
2933 wait_on_list(req->waitfor_safe);
2934 put_request(req);
2935 }
2936 }
2937
// A session is going away without further replies: wake every caller with
// a request outstanding on it (via req->kick) and forcibly retire the
// unsafe requests, marking the affected inodes with -EIO since their
// updates may never have been committed.  Both of the session's request
// lists must be empty afterwards (asserted).
2938 void Client::kick_requests_closed(MetaSession *session)
2939 {
2940 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
// Advance the iterator before the body: unregister_request() below can
// erase the current map entry.
2941 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2942 p != mds_requests.end(); ) {
2943 MetaRequest *req = p->second;
2944 ++p;
2945 if (req->mds == session->mds_num) {
2946 if (req->caller_cond) {
2947 req->kick = true;
2948 req->caller_cond->notify_all();
2949 }
2950 req->item.remove_myself();
2951 if (req->got_unsafe) {
2952 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2953 req->unsafe_item.remove_myself();
// Surface the potential data loss on the parent directory inode ...
2954 if (is_dir_operation(req)) {
2955 Inode *dir = req->inode();
2956 assert(dir);
2957 dir->set_async_err(-EIO);
2958 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2959 << dir->ino << " " << req->get_tid() << dendl;
2960 req->unsafe_dir_item.remove_myself();
2961 }
// ... and on the target inode of the operation, if any.
2962 if (req->target) {
2963 InodeRef &in = req->target;
2964 in->set_async_err(-EIO);
2965 lderr(cct) << "kick_requests_closed drop req of inode : "
2966 << in->ino << " " << req->get_tid() << dendl;
2967 req->unsafe_target_item.remove_myself();
2968 }
2969 signal_cond_list(req->waitfor_safe);
2970 unregister_request(req);
2971 }
2972 }
2973 }
2974 ceph_assert(session->requests.empty());
2975 ceph_assert(session->unsafe_requests.empty());
2976 }
2977
2978
2979
2980
2981 /************
2982 * leases
2983 */
2984
2985 void Client::got_mds_push(MetaSession *s)
2986 {
2987 s->seq++;
2988 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2989 if (s->state == MetaSession::STATE_CLOSING) {
2990 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2991 }
2992 }
2993
// Handle a dentry-lease revocation from an MDS (REVOKE is the only lease
// action the MDS sends us; asserted).  If we still know the inode and the
// named dentry, drop its lease locally; in every case, acknowledge with a
// CEPH_MDS_LEASE_RELEASE carrying the same parameters back.
2994 void Client::handle_lease(const MConstRef<MClientLease>& m)
2995 {
2996 ldout(cct, 10) << __func__ << " " << *m << dendl;
2997
2998 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2999
3000 mds_rank_t mds = mds_rank_t(m->get_source().num());
3001 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3002 if (!session) {
3003 return;
3004 }
3005
3006 got_mds_push(session);
3007
3008 ceph_seq_t seq = m->get_seq();
3009
3010 Inode *in;
3011 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
// Unknown inode or dentry: nothing to invalidate locally, but we still
// must acknowledge the revoke (fall through to 'revoke').
3012 if (inode_map.count(vino) == 0) {
3013 ldout(cct, 10) << " don't have vino " << vino << dendl;
3014 goto revoke;
3015 }
3016 in = inode_map[vino];
3017
3018 if (m->get_mask() & CEPH_LEASE_VALID) {
3019 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3020 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3021 goto revoke;
3022 }
3023 Dentry *dn = in->dir->dentries[m->dname];
3024 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3025 dn->lease_mds = -1;
3026 }
3027
3028 revoke:
3029 {
3030 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3031 m->get_mask(), m->get_ino(),
3032 m->get_first(), m->get_last(), m->dname);
3033 m->get_connection()->send_message2(std::move(reply));
3034 }
3035 }
3036
3037 void Client::put_inode(Inode *in, int n)
3038 {
3039 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3040 int left = in->_put(n);
3041 if (left == 0) {
3042 // release any caps
3043 remove_all_caps(in);
3044
3045 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3046 bool unclean = objectcacher->release_set(&in->oset);
3047 ceph_assert(!unclean);
3048 inode_map.erase(in->vino());
3049 if (use_faked_inos())
3050 _release_faked_ino(in);
3051
3052 if (in == root) {
3053 root = 0;
3054 root_ancestor = 0;
3055 while (!root_parents.empty())
3056 root_parents.erase(root_parents.begin());
3057 }
3058
3059 delete in;
3060 }
3061 }
3062
3063 void Client::close_dir(Dir *dir)
3064 {
3065 Inode *in = dir->parent_inode;
3066 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3067 ceph_assert(dir->is_empty());
3068 ceph_assert(in->dir == dir);
3069 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3070 if (!in->dentries.empty())
3071 in->get_first_parent()->put(); // unpin dentry
3072
3073 delete in->dir;
3074 in->dir = 0;
3075 put_inode(in); // unpin inode
3076 }
3077
/**
 * Attach inode 'in' to a dentry named 'name' under directory 'dir'.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * A directory inode may have only one parent dentry: when 'in' is a
 * directory already linked elsewhere, the old dentry is unlinked first
 * (tmp_ref keeps the inode alive across that unlink).
 * Returns the dentry used (newly allocated when dn was NULL).
 */
3083 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3084 {
3085 if (!dn) {
3086 // create a new Dentry
3087 dn = new Dentry(dir, name);
3088
3089 lru.lru_insert_mid(dn); // mid or top?
3090
3091 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3092 << " dn " << dn << " (new dn)" << dendl;
3093 } else {
// Pre-created dentry must not already point at an inode.
3094 ceph_assert(!dn->inode);
3095 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3096 << " dn " << dn << " (old dn)" << dendl;
3097 }
3098
3099 if (in) { // link to inode
3100 InodeRef tmp_ref;
3101 // only one parent for directories!
3102 if (in->is_dir() && !in->dentries.empty()) {
3103 tmp_ref = in; // prevent unlink below from freeing the inode.
3104 Dentry *olddn = in->get_first_parent();
3105 ceph_assert(olddn->dir != dir || olddn->name != name);
3106 Inode *old_diri = olddn->dir->parent_inode;
// The old parent's contents changed: invalidate its readdir caches.
3107 old_diri->dir_release_count++;
3108 clear_dir_complete_and_ordered(old_diri, true);
3109 unlink(olddn, true, true); // keep dir, dentry
3110 }
3111
3112 dn->link(in);
3113 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3114 }
3115
3116 return dn;
3117 }
3118
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Detach a dentry from its inode and (optionally) from its Dir.
  // Hold a ref so the inode can't be freed while we log/unlink below.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
		 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep a null dentry, but its MDS lease is no longer valid
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // close the containing Dir if this was its last entry
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3148
3149 /**
3150 * For asynchronous flushes, check for errors from the IO and
3151 * update the inode if necessary
3152 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // keeps the inode alive until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // must run under client_lock; set_async_err() below mutates the inode
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
			    << " 0x" << std::hex << inode->ino << std::dec
			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // remember the error so a later fsync/close can report it
      inode->set_async_err(r);
    }
  }
};
3170
3171
3172 /****
3173 * caps
3174 */
3175
3176 void Client::get_cap_ref(Inode *in, int cap)
3177 {
3178 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3179 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3180 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3181 in->get();
3182 }
3183 if ((cap & CEPH_CAP_FILE_CACHE) &&
3184 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3185 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3186 in->get();
3187 }
3188 in->get_cap_ref(cap);
3189 }
3190
void Client::put_cap_ref(Inode *in, int cap)
{
  // Drop cap references; when the last ref of a given cap bit goes away,
  // finish pending snapshot work and possibly release caps / inode refs.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // cap bits whose refs dropped but which are no longer even issued
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	// last writer is gone: the pending cap_snap can now be finalized
	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	// all buffered data flushed; snapshots no longer hold dirty data
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      // drop the inode refs taken in get_cap_ref()
      put_inode(in, put_nref);
  }
}
3224
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  // Acquire cap references for a file operation: loop until we hold all
  // 'need' caps (plus whatever subset of 'want' is available), or fail.
  // On success *phave holds the acquired caps and refs are taken.
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // open modes on this inode no longer cover what the op needs
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
		     << dendl;
      return -EBADF;
    }

    // stale write handle after a remount/blacklist event
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
	// write would extend past max_size; ask the MDS for a larger one
	if ((endoff >= (loff_t)in->max_size ||
	     endoff > (loff_t)(in->size << 1)) &&
	    endoff > (loff_t)in->wanted_max_size) {
	  ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
	  in->wanted_max_size = endoff;
	}
	if (in->wanted_max_size > in->max_size &&
	    in->wanted_max_size > in->requested_max_size)
	  check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
	waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
	// can't start a new write while a cap_snap is being written out
	if (in->cap_snaps.rbegin()->second.writing) {
	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
	  waitfor_caps = true;
	}
	for (auto &p : in->cap_snaps) {
	  if (p.second.dirty_data) {
	    waitfor_commit = true;
	    break;
	  }
	}
	if (waitfor_commit) {
	  _flush(in, new C_Client_FlushComplete(this, in));
	  ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
	}
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
	int revoking = implemented & ~have;
	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
		 << " need " << ccap_string(need) << " want " << ccap_string(want)
		 << " revoking " << ccap_string(revoking)
		 << dendl;
	// only take wanted caps that are not currently being revoked
	if ((revoking & want) == 0) {
	  *phave = need | (have & want);
	  in->get_cap_ref(need);
	  return 0;
	}
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
	in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // caps were dropped (e.g. session reset); re-request them from the MDS
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
	int ret = _renew_caps(in);
	if (ret < 0)
	  return ret;
	continue;
      }
      if (!(file_wanted & ~mds_wanted))
	in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3328
3329 int Client::get_caps_used(Inode *in)
3330 {
3331 unsigned used = in->caps_used();
3332 if (!(used & CEPH_CAP_FILE_CACHE) &&
3333 !objectcacher->set_is_empty(&in->oset))
3334 used |= CEPH_CAP_FILE_CACHE;
3335 return used;
3336 }
3337
3338 void Client::cap_delay_requeue(Inode *in)
3339 {
3340 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3341 in->hold_caps_until = ceph_clock_now();
3342 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3343 delayed_list.push_back(&in->delay_cap_item);
3344 }
3345
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
		      int flags, int used, int want, int retain,
		      int flush, ceph_tid_t flush_tid)
{
  // Build and send a CEPH_CAP_OP_UPDATE for one cap: acknowledge revokes,
  // flush dirty metadata (flush/flush_tid) and report current inode state.
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;   // never retain caps currently being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
	   << " mds." << session->mds_num << " seq " << cap->seq
	   << " used " << ccap_string(used)
	   << " want " << ccap_string(want)
	   << " flush " << ccap_string(flush)
	   << " retain " << ccap_string(retain)
	   << " held "<< ccap_string(held)
	   << " revoking " << ccap_string(revoking)
	   << " dropping " << ccap_string(dropping)
	   << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // when flushing dirty metadata, tell the MDS which snap seq it follows
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
				     in->ino,
				     0,
				     cap->cap_id, cap->seq,
				     cap->implemented,
				     want,
				     flush,
				     cap->mseq,
				     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // xattrs only go on the wire when the xattr EXCL cap is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // let the MDS know if a cap_snap still awaits flushing
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      // ask the auth MDS to extend max_size for further writes
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3463
3464 static bool is_max_size_approaching(Inode *in)
3465 {
3466 /* mds will adjust max size according to the reported size */
3467 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3468 return false;
3469 if (in->size >= in->max_size)
3470 return true;
3471 /* half of previous max_size increment has been used */
3472 if (in->max_size > in->reported_size &&
3473 (in->size << 1) >= in->max_size + in->reported_size)
3474 return true;
3475 return false;
3476 }
3477
3478 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3479 {
3480 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3481 return used;
3482 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3483 return used;
3484
3485 if (issued & CEPH_CAP_FILE_LAZYIO) {
3486 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3487 used &= ~CEPH_CAP_FILE_CACHE;
3488 used |= CEPH_CAP_FILE_LAZYIO;
3489 }
3490 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3491 used &= ~CEPH_CAP_FILE_BUFFER;
3492 used |= CEPH_CAP_FILE_LAZYIO;
3493 }
3494 } else {
3495 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3496 used &= ~CEPH_CAP_FILE_CACHE;
3497 used |= CEPH_CAP_FILE_LAZYIO;
3498 }
3499 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3500 used &= ~CEPH_CAP_FILE_BUFFER;
3501 used |= CEPH_CAP_FILE_LAZYIO;
3502 }
3503 }
3504 return used;
3505 }
3506
3507 /**
3508 * check_caps
3509 *
3510 * Examine currently used and wanted versus held caps. Release, flush or ack
3511 * revoked caps to the MDS as appropriate.
3512 *
3513 * @param in the inode to check
3514 * @param flags flags to apply to cap check
3515 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // start from what we must keep: wanted + in-use + pin
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
	       (issued & CEPH_CAP_FILE_SHARED) &&
	       (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
	retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
	   << " wanted " << ccap_string(wanted)
	   << " used " << ccap_string(used)
	   << " issued " << ccap_string(issued)
	   << " revoking " << ccap_string(revoking)
	   << " flags=" << flags
	   << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to drop cached data if CACHE/LAZYIO is being revoked and we hold
  // no buffered writes
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  // examine each cap (one per MDS) and decide whether to send an update
  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    // usage covered by the auth cap doesn't pin non-auth caps
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
	     << " issued " << ccap_string(cap.issued)
	     << " implemented " << ccap_string(cap.implemented)
	     << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
	in->wanted_max_size > in->requested_max_size &&
	&cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
	&cap == in->auth_cap &&
	is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
		     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
	!in->dirty_caps)               // and we have no dirty caps
      continue;

    // batch releases unless the caller asked for an immediate check
    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-send any flushes/snap flushes that were interrupted
      if (in->flags & I_KICK_FLUSH) {
	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
		       << " to mds." << mds << dendl;
	kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.flush_tid == 0)
	flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      // dirty metadata rides along on the cap update to the auth MDS
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
	msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
	     flushing, flush_tid);
  }
}
3654
3655
3656 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3657 {
3658 int used = get_caps_used(in);
3659 int dirty = in->caps_dirty();
3660 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3661
3662 if (in->cap_snaps.size() &&
3663 in->cap_snaps.rbegin()->second.writing) {
3664 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3665 return;
3666 } else if (in->caps_dirty() ||
3667 (used & CEPH_CAP_FILE_WR) ||
3668 (dirty & CEPH_CAP_ANY_WR)) {
3669 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3670 ceph_assert(capsnapem.second); /* element inserted */
3671 CapSnap &capsnap = capsnapem.first->second;
3672 capsnap.context = old_snapc;
3673 capsnap.issued = in->caps_issued();
3674 capsnap.dirty = in->caps_dirty();
3675
3676 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3677
3678 capsnap.uid = in->uid;
3679 capsnap.gid = in->gid;
3680 capsnap.mode = in->mode;
3681 capsnap.btime = in->btime;
3682 capsnap.xattrs = in->xattrs;
3683 capsnap.xattr_version = in->xattr_version;
3684 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3685 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3686
3687 if (used & CEPH_CAP_FILE_WR) {
3688 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3689 capsnap.writing = 1;
3690 } else {
3691 finish_cap_snap(in, capsnap, used);
3692 }
3693 } else {
3694 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3695 }
3696 }
3697
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  // Capture the final inode state into the capsnap once writers are done,
  // then flush it to the MDS (unless buffered data must drain first).
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; _flushed_cap_snap() flushes once they drain
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
	     << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3728
3729 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3730 {
3731 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
3732 in->cap_snaps.at(seq).dirty_data = 0;
3733 flush_snaps(in);
3734 }
3735
3736 void Client::send_flush_snap(Inode *in, MetaSession *session,
3737 snapid_t follows, CapSnap& capsnap)
3738 {
3739 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3740 in->ino, in->snaprealm->ino, 0,
3741 in->auth_cap->mseq, cap_epoch_barrier);
3742 m->caller_uid = capsnap.cap_dirtier_uid;
3743 m->caller_gid = capsnap.cap_dirtier_gid;
3744
3745 m->set_client_tid(capsnap.flush_tid);
3746 m->head.snap_follows = follows;
3747
3748 m->head.caps = capsnap.issued;
3749 m->head.dirty = capsnap.dirty;
3750
3751 m->head.uid = capsnap.uid;
3752 m->head.gid = capsnap.gid;
3753 m->head.mode = capsnap.mode;
3754 m->btime = capsnap.btime;
3755
3756 m->size = capsnap.size;
3757
3758 m->head.xattr_version = capsnap.xattr_version;
3759 encode(capsnap.xattrs, m->xattrbl);
3760
3761 m->ctime = capsnap.ctime;
3762 m->btime = capsnap.btime;
3763 m->mtime = capsnap.mtime;
3764 m->atime = capsnap.atime;
3765 m->time_warp_seq = capsnap.time_warp_seq;
3766 m->change_attr = capsnap.change_attr;
3767
3768 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3769 m->inline_version = in->inline_version;
3770 m->inline_data = in->inline_data;
3771 }
3772
3773 ceph_assert(!session->flushing_caps_tids.empty());
3774 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3775
3776 session->con->send_message2(std::move(m));
3777 }
3778
void Client::flush_snaps(Inode *in)
{
  // Send any unflushed, ready cap_snaps for this inode to the auth MDS,
  // in snap order; stop at the first one that is not yet ready.
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
	     << " follows " << p.first
	     << " size " << capsnap.size
	     << " mtime " << capsnap.mtime
	     << " dirty_data=" << capsnap.dirty_data
	     << " writing=" << capsnap.writing
	     << " on " << *in << dendl;
    // snaps must flush in order; a snap with dirty data or an active
    // writer blocks everything after it
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush tid with both the session and the inode so the
    // FLUSHSNAP ack can be matched up later
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3813
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  // Block the calling thread until signal_cond_list() wakes this condvar.
  // The caller already holds client_lock: adopt it so the condvar can
  // atomically release it while sleeping, then release() afterwards so the
  // unique_lock destructor does not unlock it a second time on return.
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
3823
3824 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3825 {
3826 for (auto cond : ls) {
3827 cond->notify_all();
3828 }
3829 }
3830
void Client::wait_on_context_list(list<Context*>& ls)
{
  // Queue a C_Cond on the list and sleep until signal_context_list()
  // completes it (the completion sets 'done' and notifies 'cond').
  ceph::condition_variable cond;
  bool done = false;
  int r;  // written by C_Cond with the completion result; not used here
  ls.push_back(new C_Cond(cond, &done, &r));
  // caller holds client_lock: adopt it for the wait, then release() so it
  // is not unlocked again on scope exit
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3841
3842 void Client::signal_context_list(list<Context*>& ls)
3843 {
3844 while (!ls.empty()) {
3845 ls.front()->complete(0);
3846 ls.pop_front();
3847 }
3848 }
3849
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  // After a session event, poke every inode holding caps from this session
  // so blocked get_caps() callers re-evaluate their state.
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      // session was re-established: forget outstanding max_size requests
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
	// mds did not re-issue stale cap.
	cap->issued = cap->implemented = CEPH_CAP_PIN;
	// make sure mds knows what we want.
	if (in.caps_file_wanted() & ~cap->wanted)
	  in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3869
3870
3871 // flush dirty data (from objectcache)
3872
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; no Inode ptr is kept
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // record the externally visible ino (the faked one, if configured)
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
3892
3893 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3894 {
3895 if (unmounting)
3896 return;
3897 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
3898 ino_invalidate_cb(callback_handle, ino, off, len);
3899 }
3900
3901 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3902
3903 if (ino_invalidate_cb)
3904 // we queue the invalidate, which calls the callback and decrements the ref
3905 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3906 }
3907
3908 void Client::_invalidate_inode_cache(Inode *in)
3909 {
3910 ldout(cct, 10) << __func__ << " " << *in << dendl;
3911
3912 // invalidate our userspace inode cache
3913 if (cct->_conf->client_oc) {
3914 objectcacher->release_set(&in->oset);
3915 if (!objectcacher->set_is_empty(&in->oset))
3916 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3917 }
3918
3919 _schedule_invalidate_callback(in, 0, 0);
3920 }
3921
3922 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3923 {
3924 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
3925
3926 // invalidate our userspace inode cache
3927 if (cct->_conf->client_oc) {
3928 vector<ObjectExtent> ls;
3929 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3930 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3931 }
3932
3933 _schedule_invalidate_callback(in, off, len);
3934 }
3935
3936 bool Client::_release(Inode *in)
3937 {
3938 ldout(cct, 20) << "_release " << *in << dendl;
3939 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3940 _invalidate_inode_cache(in);
3941 return true;
3942 }
3943 return false;
3944 }
3945
3946 bool Client::_flush(Inode *in, Context *onfinish)
3947 {
3948 ldout(cct, 10) << "_flush " << *in << dendl;
3949
3950 if (!in->oset.dirty_or_tx) {
3951 ldout(cct, 10) << " nothing to flush" << dendl;
3952 onfinish->complete(0);
3953 return true;
3954 }
3955
3956 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3957 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3958 objectcacher->purge_set(&in->oset);
3959 if (onfinish) {
3960 onfinish->complete(-ENOSPC);
3961 }
3962 return true;
3963 }
3964
3965 return objectcacher->flush_set(&in->oset, onfinish);
3966 }
3967
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  // Synchronously flush a byte range of buffered data to the OSDs.
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    // drop client_lock while blocked so the flush completion (which needs
    // the lock) can make progress
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
3986
3987 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3988 {
3989 // std::lock_guard l(client_lock);
3990 ceph_assert(ceph_mutex_is_locked(client_lock)); // will be called via dispatch() -> objecter -> ...
3991 Inode *in = static_cast<Inode *>(oset->parent);
3992 ceph_assert(in);
3993 _flushed(in);
3994 }
3995
3996 void Client::_flushed(Inode *in)
3997 {
3998 ldout(cct, 10) << "_flushed " << *in << dendl;
3999
4000 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4001 }
4002
4003
4004
4005 // checks common to add_update_cap, handle_cap_grant
4006 void Client::check_cap_issue(Inode *in, unsigned issued)
4007 {
4008 unsigned had = in->caps_issued();
4009
4010 if ((issued & CEPH_CAP_FILE_CACHE) &&
4011 !(had & CEPH_CAP_FILE_CACHE))
4012 in->cache_gen++;
4013
4014 if ((issued & CEPH_CAP_FILE_SHARED) &&
4015 !(had & CEPH_CAP_FILE_SHARED)) {
4016 in->shared_gen++;
4017
4018 if (in->is_dir())
4019 clear_dir_complete_and_ordered(in, true);
4020 }
4021 }
4022
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  // Record a newly granted (or updated) cap from mds_session on this inode,
  // keeping snaprealm membership, the auth cap pointer and waiters in sync.
  if (!in->is_any_caps()) {
    // first cap on this inode: join its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // auth MDS says the inode belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // cap for this MDS already existed; update it in place
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
	ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	// migrate pending cap flushes to the new auth MDS' session
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // newer migration seq replaces wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  // wake anyone waiting for the newly granted cap bits
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4119
4120 void Client::remove_cap(Cap *cap, bool queue_release)
4121 {
4122 auto &in = cap->inode;
4123 MetaSession *session = cap->session;
4124 mds_rank_t mds = cap->session->mds_num;
4125
4126 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4127
4128 if (queue_release) {
4129 session->enqueue_cap_release(
4130 in.ino,
4131 cap->cap_id,
4132 cap->issue_seq,
4133 cap->mseq,
4134 cap_epoch_barrier);
4135 }
4136
4137 if (in.auth_cap == cap) {
4138 if (in.flushing_cap_item.is_on_list()) {
4139 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4140 in.flushing_cap_item.remove_myself();
4141 }
4142 in.auth_cap = NULL;
4143 }
4144 size_t n = in.caps.erase(mds);
4145 ceph_assert(n == 1);
4146 cap = nullptr;
4147
4148 if (!in.is_any_caps()) {
4149 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4150 in.snaprealm_item.remove_myself();
4151 put_snap_realm(in.snaprealm);
4152 in.snaprealm = 0;
4153 }
4154 }
4155
4156 void Client::remove_all_caps(Inode *in)
4157 {
4158 while (!in->caps.empty())
4159 remove_cap(&in->caps.begin()->second, true);
4160 }
4161
/*
 * Tear down every cap held through session s (session reset/close paths).
 * err describes why the session died (e.g. -EBLACKLISTED) and controls how
 * cached-but-unwritable data is handled.
 */
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  // Each iteration removes the head of s->caps, so the loop terminates
  // when the session's cap list drains.
  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // pin the inode across remove_cap()
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // Losing the auth cap: dirty/flushing state can no longer be
      // written back through this session.
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    // NOTE: cap is invalid after this call (erased from in->caps).
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // Discard the dirty/flushing state; it can never reach the MDS now.
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the inode ref that was held on behalf of the dirty caps
      put_inode(in.get());
    }
    // If we implemented CACHE/BUFFER but those bits are no longer issued,
    // the object cacher's contents for this inode are stale.
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -EBLACKLISTED) {
        // blacklisted: dirty data can never be written back; surface the
        // error on the inode and throw the cached data away
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4211
4212 int Client::_do_remount(bool retry_on_error)
4213 {
4214 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
4215
4216 errno = 0;
4217 int r = remount_cb(callback_handle);
4218 if (r == 0) {
4219 retries_on_invalidate = 0;
4220 } else {
4221 int e = errno;
4222 client_t whoami = get_nodeid();
4223 if (r == -1) {
4224 lderr(cct) <<
4225 "failed to remount (to trim kernel dentries): "
4226 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4227 } else {
4228 lderr(cct) <<
4229 "failed to remount (to trim kernel dentries): "
4230 "return code = " << r << dendl;
4231 }
4232 bool should_abort =
4233 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4234 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4235 !(retry_on_error && (++retries_on_invalidate < max_retries));
4236 if (should_abort && !unmounting) {
4237 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4238 ceph_abort();
4239 }
4240 }
4241 return r;
4242 }
4243
4244 class C_Client_Remount : public Context {
4245 private:
4246 Client *client;
4247 public:
4248 explicit C_Client_Remount(Client *c) : client(c) {}
4249 void finish(int r) override {
4250 ceph_assert(r == 0);
4251 client->_do_remount(true);
4252 }
4253 };
4254
4255 void Client::_invalidate_kernel_dcache()
4256 {
4257 if (unmounting)
4258 return;
4259 if (can_invalidate_dentries) {
4260 if (dentry_invalidate_cb && root->dir) {
4261 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4262 p != root->dir->dentries.end();
4263 ++p) {
4264 if (p->second->inode)
4265 _schedule_invalidate_dentry_callback(p->second, false);
4266 }
4267 }
4268 } else if (remount_cb) {
4269 // Hacky:
4270 // when remounting a file system, linux kernel trims all unused dentries in the fs
4271 remount_finisher.queue(new C_Client_Remount(this));
4272 }
4273 }
4274
/*
 * Drop negative (null) child dentries of a directory inode, and recurse
 * into its snapdir if one is open.  Only acts when *every* dentry in the
 * dir is negative; a mixed dir is still useful and is left alone.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance first: unlink() may erase dn from the map, which would
      // invalidate the current iterator
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    // the dir may now be empty; release its memory
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  // the snapdir keeps its own dentry set; trim it too
  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4299
4300 class C_Client_CacheRelease : public Context {
4301 private:
4302 Client *client;
4303 vinodeno_t ino;
4304 public:
4305 C_Client_CacheRelease(Client *c, Inode *in) :
4306 client(c) {
4307 if (client->use_faked_inos())
4308 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4309 else
4310 ino = in->vino();
4311 }
4312 void finish(int r) override {
4313 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4314 client->_async_inode_release(ino);
4315 }
4316 };
4317
4318 void Client::_async_inode_release(vinodeno_t ino)
4319 {
4320 if (unmounting)
4321 return;
4322 ldout(cct, 10) << __func__ << " " << ino << dendl;
4323 ino_release_cb(callback_handle, ino);
4324 }
4325
4326 void Client::_schedule_ino_release_callback(Inode *in) {
4327
4328 if (ino_release_cb)
4329 // we queue the invalidate, which calls the callback and decrements the ref
4330 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4331 }
4332
/*
 * Try to shrink the number of caps held via session s down to max.
 * Non-auth caps that nothing is using are dropped outright; otherwise we
 * trim dentries so the inode (and eventually its cap) becomes reclaimable.
 */
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // pin the inode while we work on it

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: everything we use through it is also
      // covered by the auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        // comma expression: remove_cap() frees cap, then null the pointer
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      // auth (or only) cap: make the inode expireable by trimming dentries
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;  // advance before dn may be unlinked
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // every dentry was expireable -> the cap should become droppable
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
        _schedule_ino_release_callback(in.get());
      }
    }
  }
  // Now that traversal is done it is safe to actually drop the dentries.
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4401
4402 void Client::force_session_readonly(MetaSession *s)
4403 {
4404 s->readonly = true;
4405 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4406 auto &in = (*p)->inode;
4407 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4408 signal_cond_list(in.waitfor_caps);
4409 }
4410 }
4411
4412 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4413 {
4414 MetaSession *session = in->auth_cap->session;
4415
4416 int flushing = in->dirty_caps;
4417 ceph_assert(flushing);
4418
4419 ceph_tid_t flush_tid = ++last_flush_tid;
4420 in->flushing_cap_tids[flush_tid] = flushing;
4421
4422 if (!in->flushing_caps) {
4423 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4424 num_flushing_caps++;
4425 } else {
4426 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4427 }
4428
4429 in->flushing_caps |= flushing;
4430 in->mark_caps_clean();
4431
4432 if (!in->flushing_cap_item.is_on_list())
4433 session->flushing_caps.push_back(&in->flushing_cap_item);
4434 session->flushing_caps_tids.insert(flush_tid);
4435
4436 *ptid = flush_tid;
4437 return flushing;
4438 }
4439
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
{
  // Re-home all of this inode's in-flight flush bookkeeping from old_s to
  // new_s (used when the auth MDS changes mid-flush).

  // pending capsnap flushes
  for (auto &entry : in->cap_snaps) {
    const CapSnap &capsnap = entry.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }

  // regular cap flush tids
  for (const auto &[tid, caps] : in->flushing_cap_tids) {
    old_s->flushing_caps_tids.erase(tid);
    new_s->flushing_caps_tids.insert(tid);
  }

  // xlist push_back implicitly unlinks the item from old_s's list
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
4457
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first pass: inodes whose cap check was delayed
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    // advance before pop_front() unlinks the current entry
    ++p;
    delayed_list.pop_front();
    // only the very last check_caps of the whole pass is synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4490
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  // Block until every cap flush on this inode with tid <= want has been
  // acked by the MDS (waiters are woken from handle_cap_flush_ack).
  for (;;) {
    if (!in->flushing_caps)
      break;
    auto oldest = in->flushing_cap_tids.begin();
    ceph_assert(oldest != in->flushing_cap_tids.end());
    if (oldest->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(oldest->second) << " want " << want
                   << " last " << oldest->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4504
void Client::wait_sync_caps(ceph_tid_t want)
{
  // Wait until no session has an outstanding flush tid <= want.  After
  // every wakeup we rescan all sessions from scratch, since the session
  // map may have changed while we slept.
  bool rescan = true;
  while (rescan) {
    rescan = false;
    ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                   << num_flushing_caps << " total flushing)" << dendl;
    for (auto &p : mds_sessions) {
      MetaSession *s = &p.second;
      if (s->flushing_caps_tids.empty())
        continue;
      ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
      if (oldest_tid <= want) {
        ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                       << " (want " << want << ")" << dendl;
        // client_lock is already held: adopt it for the condvar wait, then
        // release ownership (without unlocking) once we wake up.
        std::unique_lock l{client_lock, std::adopt_lock};
        sync_cond.wait(l);
        l.release();
        rescan = true;
        break;
      }
    }
  }
}
4525
/*
 * Re-send every pending cap flush (and capsnap flush) for this inode to
 * its auth session, in tid order.  Used after session reconnect so the MDS
 * sees flushes it may have lost.
 */
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // Find the tid of the most recent pending capsnap flush; entries with a
  // zero cap mask in flushing_cap_tids are snap flushes.
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // Cap flushes older than the last snap flush are flagged so the MDS
      // keeps snap/cap flush ordering intact.
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // zero mask == capsnap flush; cap_snaps iterates in matching tid order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4559
4560 void Client::kick_flushing_caps(MetaSession *session)
4561 {
4562 mds_rank_t mds = session->mds_num;
4563 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4564
4565 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4566 Inode *in = *p;
4567 if (in->flags & I_KICK_FLUSH) {
4568 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4569 kick_flushing_caps(in, session);
4570 }
4571 }
4572 }
4573
4574 void Client::early_kick_flushing_caps(MetaSession *session)
4575 {
4576 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4577 Inode *in = *p;
4578 Cap *cap = in->auth_cap;
4579 ceph_assert(cap);
4580
4581 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4582 // stage. This guarantees that MDS processes the cap flush message before issuing
4583 // the flushing caps to other client.
4584 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4585 in->flags |= I_KICK_FLUSH;
4586 continue;
4587 }
4588
4589 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4590 << " to mds." << session->mds_num << dendl;
4591 // send_reconnect() also will reset these sequence numbers. make sure
4592 // sequence numbers in cap flush message match later reconnect message.
4593 cap->seq = 0;
4594 cap->issue_seq = 0;
4595 cap->mseq = 0;
4596 cap->issued = cap->implemented;
4597
4598 kick_flushing_caps(in, session);
4599 }
4600 }
4601
void SnapRealm::build_snap_context()
{
  // Rebuild cached_snap_context from our own snaps plus those inherited
  // through the parent chain.
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // snaps inherited from prior parents
  for (const auto &snap : prior_parent_snaps)
    snaps.insert(snap);

  // current parent's snaps, but only those that postdate our attachment
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (const auto &snap : psnapc.snaps) {
      if (snap >= parent_since)
        snaps.insert(snap);
    }
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (const auto &snap : my_snaps)
    snaps.insert(snap);

  // publish: snaps are stored in descending order
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.clear();
  cached_snap_context.snaps.reserve(snaps.size());
  for (auto it = snaps.rbegin(); it != snaps.rend(); ++it)
    cached_snap_context.snaps.push_back(*it);
}
4632
void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
{
  // BFS over the realm hierarchy, invalidating every cached snap context.
  list<SnapRealm*> queue;
  queue.push_back(realm);

  while (!queue.empty()) {
    SnapRealm *cur = queue.front();
    queue.pop_front();

    ldout(cct, 10) << __func__ << " " << *cur << dendl;
    cur->invalidate_cache();

    for (auto *child : cur->pchildren)
      queue.push_back(child);
  }
}
4651
4652 SnapRealm *Client::get_snap_realm(inodeno_t r)
4653 {
4654 SnapRealm *realm = snap_realms[r];
4655 if (!realm)
4656 snap_realms[r] = realm = new SnapRealm(r);
4657 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4658 realm->nref++;
4659 return realm;
4660 }
4661
4662 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4663 {
4664 if (snap_realms.count(r) == 0) {
4665 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4666 return NULL;
4667 }
4668 SnapRealm *realm = snap_realms[r];
4669 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4670 realm->nref++;
4671 return realm;
4672 }
4673
4674 void Client::put_snap_realm(SnapRealm *realm)
4675 {
4676 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4677 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4678 if (--realm->nref == 0) {
4679 snap_realms.erase(realm->ino);
4680 if (realm->pparent) {
4681 realm->pparent->pchildren.erase(realm);
4682 put_snap_realm(realm->pparent);
4683 }
4684 delete realm;
4685 }
4686 }
4687
4688 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4689 {
4690 if (realm->parent != parent) {
4691 ldout(cct, 10) << __func__ << " " << *realm
4692 << " " << realm->parent << " -> " << parent << dendl;
4693 realm->parent = parent;
4694 if (realm->pparent) {
4695 realm->pparent->pchildren.erase(realm);
4696 put_snap_realm(realm->pparent);
4697 }
4698 realm->pparent = get_snap_realm(parent);
4699 realm->pparent->pchildren.insert(realm);
4700 return true;
4701 }
4702 return false;
4703 }
4704
4705 static bool has_new_snaps(const SnapContext& old_snapc,
4706 const SnapContext& new_snapc)
4707 {
4708 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4709 }
4710
4711
/*
 * Apply a snap trace (a sequence of encoded SnapRealmInfo) from the MDS.
 *
 * @param bl the encoded trace
 * @param realm_ret if non-null, receives the first realm in the trace with
 *        a reference held (caller must put_snap_realm); may be NULL when
 *        the trace is empty.
 * @param flush if true, queue cap-snap writeback for realms whose snap
 *        list grew (using the pre-update snap context).
 *
 * Fix: if the trace is empty, first_realm stays NULL; the original
 * unconditionally called put_snap_realm(first_realm) in the !realm_ret
 * case, dereferencing a null pointer inside its log statement.
 */
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  // realms whose snap list is about to change, with their *old* snap
  // context (needed to decide what to write back)
  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
                     << dendl;

      if (flush) {
        // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
        // flush me + children
        list<SnapRealm*> q;
        q.push_back(realm);
        while (!q.empty()) {
          SnapRealm *realm = q.front();
          q.pop_front();

          for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
               p != realm->pchildren.end();
               ++p)
            q.push_back(*p);

          if (dirty_realms.count(realm) == 0) {
            realm->nref++;  // ref dropped after the writeback pass below
            dirty_realms[realm] = realm->get_snap_context();
          }
        }
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
                     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;  // keep the ref for the caller (or drop below)
    else
      put_snap_realm(realm);
  }

  // queue writeback for realms that actually gained snaps
  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
        Inode *in = *r;
        ++r;
        queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else if (first_realm)  // may be NULL when the trace was empty
    put_snap_realm(first_realm);
}
4804
/*
 * Handle an MClientSnap message.  For a SPLIT, inodes and child realms are
 * moved out of the parent realm into the newly split realm; the snap trace
 * is then applied, and moved inodes get cap-snap writeback queued for any
 * snaps they have not yet seen.
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inodes to re-parent into the split realm, with their old snap context
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // a later event already moved this inode into a newer realm;
        // don't yank it back
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; on DESTROY there is nothing left worth flushing
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // link moved inodes into the split realm (taking a realm ref each)
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4875
4876 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4877 {
4878 mds_rank_t mds = mds_rank_t(m->get_source().num());
4879 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4880 if (!session) {
4881 return;
4882 }
4883
4884 got_mds_push(session);
4885
4886 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4887
4888 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4889 if (inode_map.count(vino)) {
4890 Inode *in = NULL;
4891 in = inode_map[vino];
4892
4893 if (in) {
4894 in->quota = m->quota;
4895 in->rstat = m->rstat;
4896 }
4897 }
4898 }
4899
4900 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4901 {
4902 mds_rank_t mds = mds_rank_t(m->get_source().num());
4903 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4904 if (!session) {
4905 return;
4906 }
4907
4908 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4909 // Pause RADOS operations until we see the required epoch
4910 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4911 }
4912
4913 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4914 // Record the barrier so that we will transmit it to MDS when releasing
4915 set_cap_epoch_barrier(m->osd_epoch_barrier);
4916 }
4917
4918 got_mds_push(session);
4919
4920 Inode *in;
4921 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4922 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4923 in = it->second;
4924 } else {
4925 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4926 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4927 session->enqueue_cap_release(
4928 m->get_ino(),
4929 m->get_cap_id(),
4930 m->get_seq(),
4931 m->get_mseq(),
4932 cap_epoch_barrier);
4933 } else {
4934 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4935 }
4936
4937 // in case the mds is waiting on e.g. a revocation
4938 flush_cap_releases();
4939 return;
4940 }
4941
4942 switch (m->get_op()) {
4943 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4944 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4945 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4946 }
4947
4948 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4949 Cap &cap = in->caps.at(mds);
4950
4951 switch (m->get_op()) {
4952 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4953 case CEPH_CAP_OP_IMPORT:
4954 case CEPH_CAP_OP_REVOKE:
4955 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4956 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4957 }
4958 } else {
4959 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4960 return;
4961 }
4962 }
4963
/*
 * Handle CEPH_CAP_OP_IMPORT: another MDS has exported this inode's cap to
 * us.  Install/refresh our cap for this session, drop the stale cap that
 * still points at the exporting MDS, and kick any pending flushes now that
 * we (may) hold the auth cap.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  // remember the exporting MDS' cap (if we still hold one) so it can be
  // removed once the import is applied
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
                 issued, wanted, m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exporter's cap; queue a release only if it asked for one
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // a max_size we had requested from the old auth MDS is void now
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
        in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5006
/*
 * Handle CEPH_CAP_OP_EXPORT: this MDS is handing our cap off to a peer.
 * If we already hold a cap from the peer we fold the exported state into
 * it; otherwise we install a placeholder cap for the peer so nothing is
 * lost before its IMPORT arrives.  The exported cap is then removed.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // ignore stale EXPORTs for a cap we no longer hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        // cap is moving to a specific peer MDS
        const auto peer_mds = mds_rank_t(m->peer.mds);
        MetaSession *tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          Cap &tcap = it->second;
          // merge into the peer's existing cap, but only if the peer's
          // record is older than what this export describes
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            // peer.seq - 1: the peer's own IMPORT will bump it to peer.seq
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession);
          }
        } else {
          // no cap from the peer yet: create one carrying the exported state
          add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the cap is simply being dropped
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5051
5052 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5053 {
5054 mds_rank_t mds = session->mds_num;
5055 ceph_assert(in->caps.count(mds));
5056
5057 ldout(cct, 10) << __func__ << " on ino " << *in
5058 << " size " << in->size << " -> " << m->get_size()
5059 << dendl;
5060
5061 int issued;
5062 in->caps_issued(&issued);
5063 issued |= in->caps_dirty();
5064 update_inode_file_size(in, issued, m->get_size(),
5065 m->get_truncate_seq(), m->get_truncate_size());
5066 }
5067
// Handle a CEPH_CAP_OP_FLUSH_ACK from the MDS: the cap flush identified by
// the message's client tid — and, implicitly, every older pending flush tid
// on this inode — is complete.  Erase the acked tids from both the session
// and the inode, clear the corresponding flushing cap bits, and wake anyone
// waiting on caps or on a sync.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();     // cap bits the MDS says were flushed
  int cleaned = 0;                // cap bits we can clear from flushing_caps
  int flushed = 0;                // number of flush tids retired by this ack

  // NOTE(review): assumes flushing_cap_tids is non-empty whenever a
  // FLUSH_ACK arrives — verify callers guarantee this before the deref.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
                  << " got unexpected flush ack tid " << flush_ack_tid
                  << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (!it->second) {
      // cap snap — entries with zero cap bits are capsnap flushes, retired
      // by handle_cap_flushsnap_ack instead; skip them here.
      ++it;
      continue;
    }
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // this tid (and anything older) is covered by the ack: retire it
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // newer flushes are still in flight; their bits are not yet clean
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // if nothing older than this ack is still flushing on the session,
    // a blocked sync may now be able to complete
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.notify_all();
  }

  if (!dirty) {
    // no dirty caps remain: forget who last dirtied this inode
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->flushing_cap_tids.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // presumably drops the ref held while caps were dirty/flushing —
      // see the matching get in the dirty-caps path; TODO confirm.
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5135
5136
// Handle a CEPH_CAP_OP_FLUSHSNAP_ACK: the CapSnap following snapid
// m->get_snap_follows() has been persisted by the MDS.  Retire its flush
// tid from the session and inode, erase the CapSnap, and wake waiters.
// An ack for an unknown CapSnap is logged as a likely duplicate and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack is for a different (older/newer) flush of this snap; ignore
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we erase state that may pin it
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // nothing older still flushing on this session -> a sync may complete
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5169
5170 class C_Client_DentryInvalidate : public Context {
5171 private:
5172 Client *client;
5173 vinodeno_t dirino;
5174 vinodeno_t ino;
5175 string name;
5176 public:
5177 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5178 client(c), name(dn->name) {
5179 if (client->use_faked_inos()) {
5180 dirino.ino = dn->dir->parent_inode->faked_ino;
5181 if (del)
5182 ino.ino = dn->inode->faked_ino;
5183 } else {
5184 dirino = dn->dir->parent_inode->vino();
5185 if (del)
5186 ino = dn->inode->vino();
5187 }
5188 if (!del)
5189 ino.ino = inodeno_t();
5190 }
5191 void finish(int r) override {
5192 // _async_dentry_invalidate is responsible for its own locking
5193 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5194 client->_async_dentry_invalidate(dirino, ino, name);
5195 }
5196 };
5197
// Invoke the registered dentry-invalidation callback.  Runs on the
// async_dentry_invalidator finisher thread (queued via
// C_Client_DentryInvalidate); becomes a no-op once unmounting has begun.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5206
// Queue an async dcache-invalidation upcall for dentry `dn`.  Only queued
// when an invalidation callback is registered and the inode still has
// ll (low-level API) references; `del` makes the upcall also identify the
// dentry's target inode (see C_Client_DentryInvalidate).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
5212
// Try to shed cached state that pins inode `in`: expire child dentries
// (recursing through snapshot subtrees), close an emptied Dir, trim an open
// ".snap" dir, and finally unlink the inode's own dentries.  When
// `sched_inval` is set, dentries still referenced by the ll layer also get
// dcache-invalidation upcalls scheduled.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase dn from the map
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // account for the reference the (now closed) Dir held on `in`
    }
  }

  // an open snapdir also pins the inode; trim it too
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    // still referenced: drop the dentries that point AT this inode
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() erases dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5259
// Handle a cap GRANT/REVOKE (or IMPORT) from an MDS: refresh the cap's
// seq/gen, apply whatever inode metadata the MDS is authoritative for,
// then reconcile our issued/implemented bits against the newly granted
// set — flushing buffers or releasing cached data when caps are revoked —
// and finally wake waiters and possibly trim a deleted inode.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // session->cap_gen advances when the session goes stale; a cap carrying
  // an older gen was implicitly revoked and must be rebuilt from CEPH_CAP_PIN.
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // Only accept MDS-supplied metadata for a lock when we do not hold the
  // corresponding EXCL cap ourselves (our local dirty state would be newer).
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    // link count hit zero while we hold LINK caps: the file was unlinked
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size — only the auth cap carries an authoritative max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // revoking cached reads: drop the page cache for this inode
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5415
5416 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5417 {
5418 if (perms.uid() == 0)
5419 return 0;
5420
5421 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5422 int ret = _posix_acl_permission(in, perms, want);
5423 if (ret != -EAGAIN)
5424 return ret;
5425 }
5426
5427 // check permissions before doing anything else
5428 if (!in->check_mode(perms, want))
5429 return -EACCES;
5430 return 0;
5431 }
5432
5433 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5434 const UserPerm& perms)
5435 {
5436 int r = _getattr_for_perm(in, perms);
5437 if (r < 0)
5438 goto out;
5439
5440 r = 0;
5441 if (strncmp(name, "system.", 7) == 0) {
5442 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5443 r = -EPERM;
5444 } else {
5445 r = inode_permission(in, perms, want);
5446 }
5447 out:
5448 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5449 return r;
5450 }
5451
5452 ostream& operator<<(ostream &out, const UserPerm& perm) {
5453 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5454 return out;
5455 }
5456
// Check whether `perms` may apply the setattr described by (stx, mask) to
// inode `in`, mirroring chown/chmod/utimes permission rules.  Returns 0 if
// permitted, -EPERM/-EACCES (or a getattr error) otherwise.  NOTE: may
// clear S_ISGID in stx->stx_mode as a side effect (see the MODE branch).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // default verdict for the ownership checks below; r stays -EPERM on any
  // `goto out` until the final `r = 0`
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only set the
    // uid to its current value
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // non-root: must own the file, and the new gid must be one of the
    // caller's groups (or be unchanged)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // like chmod(2): silently drop setgid when the caller is not in the
    // file's (resulting) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // setting explicit timestamps requires ownership; setting "now"
      // (utimes(NULL)-style) only requires write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5513
5514 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5515 {
5516 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5517 unsigned want = 0;
5518
5519 if ((flags & O_ACCMODE) == O_WRONLY)
5520 want = MAY_WRITE;
5521 else if ((flags & O_ACCMODE) == O_RDWR)
5522 want = MAY_READ | MAY_WRITE;
5523 else if ((flags & O_ACCMODE) == O_RDONLY)
5524 want = MAY_READ;
5525 if (flags & O_TRUNC)
5526 want |= MAY_WRITE;
5527
5528 int r = 0;
5529 switch (in->mode & S_IFMT) {
5530 case S_IFLNK:
5531 r = -ELOOP;
5532 goto out;
5533 case S_IFDIR:
5534 if (want & MAY_WRITE) {
5535 r = -EISDIR;
5536 goto out;
5537 }
5538 break;
5539 }
5540
5541 r = _getattr_for_perm(in, perms);
5542 if (r < 0)
5543 goto out;
5544
5545 r = inode_permission(in, perms, want);
5546 out:
5547 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5548 return r;
5549 }
5550
5551 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5552 {
5553 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5554 int r = _getattr_for_perm(dir, perms);
5555 if (r < 0)
5556 goto out;
5557
5558 r = inode_permission(dir, perms, MAY_EXEC);
5559 out:
5560 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5561 return r;
5562 }
5563
5564 int Client::may_create(Inode *dir, const UserPerm& perms)
5565 {
5566 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5567 int r = _getattr_for_perm(dir, perms);
5568 if (r < 0)
5569 goto out;
5570
5571 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5572 out:
5573 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5574 return r;
5575 }
5576
5577 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5578 {
5579 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5580 int r = _getattr_for_perm(dir, perms);
5581 if (r < 0)
5582 goto out;
5583
5584 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5585 if (r < 0)
5586 goto out;
5587
5588 /* 'name == NULL' means rmsnap */
5589 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5590 InodeRef otherin;
5591 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5592 if (r < 0)
5593 goto out;
5594 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5595 r = -EPERM;
5596 }
5597 out:
5598 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5599 return r;
5600 }
5601
5602 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5603 {
5604 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5605 int r = _getattr_for_perm(in, perms);
5606 if (r < 0)
5607 goto out;
5608
5609 if (perms.uid() == 0 || perms.uid() == in->uid) {
5610 r = 0;
5611 goto out;
5612 }
5613
5614 r = -EPERM;
5615 if (!S_ISREG(in->mode))
5616 goto out;
5617
5618 if (in->mode & S_ISUID)
5619 goto out;
5620
5621 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5622 goto out;
5623
5624 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5625 out:
5626 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5627 return r;
5628 }
5629
5630 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5631 {
5632 int mask = CEPH_STAT_CAP_MODE;
5633 bool force = false;
5634 if (acl_type != NO_ACL) {
5635 mask |= CEPH_STAT_CAP_XATTR;
5636 force = in->xattr_version == 0;
5637 }
5638 return _getattr(in, mask, perms, force);
5639 }
5640
// Build the (inode number, snapshot id) pair identifying `in`.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5646
5647 /**
5648 * Resolve an MDS spec to a list of MDS daemon GIDs.
5649 *
5650 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5651 * It may be '*' in which case it matches all GIDs.
5652 *
5653 * If no error is returned, the `targets` vector will be populated with at least
5654 * one MDS.
5655 */
5656 int Client::resolve_mds(
5657 const std::string &mds_spec,
5658 std::vector<mds_gid_t> *targets)
5659 {
5660 ceph_assert(fsmap);
5661 ceph_assert(targets != nullptr);
5662
5663 mds_role_t role;
5664 std::stringstream ss;
5665 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5666 if (role_r == 0) {
5667 // We got a role, resolve it to a GID
5668 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5669 << role << "'" << dendl;
5670 targets->push_back(
5671 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5672 return 0;
5673 }
5674
5675 std::string strtol_err;
5676 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5677 if (strtol_err.empty()) {
5678 // It is a possible GID
5679 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5680 if (fsmap->gid_exists(mds_gid)) {
5681 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5682 targets->push_back(mds_gid);
5683 } else {
5684 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5685 << dendl;
5686 return -ENOENT;
5687 }
5688 } else if (mds_spec == "*") {
5689 // It is a wildcard: use all MDSs
5690 const auto mds_info = fsmap->get_mds_info();
5691
5692 if (mds_info.empty()) {
5693 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5694 return -ENOENT;
5695 }
5696
5697 for (const auto i : mds_info) {
5698 targets->push_back(i.first);
5699 }
5700 } else {
5701 // It did not parse as an integer, it is not a wildcard, it must be a name
5702 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5703 if (mds_gid == 0) {
5704 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5705
5706 lderr(cct) << "FSMap: " << *fsmap << dendl;
5707
5708 return -ENOENT;
5709 } else {
5710 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5711 << "' to GID " << mds_gid << dendl;
5712 targets->push_back(mds_gid);
5713 }
5714 }
5715
5716 return 0;
5717 }
5718
5719
5720 /**
5721 * Authenticate with mon and establish global ID
5722 */
5723 int Client::authenticate()
5724 {
5725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5726
5727 if (monclient->is_authenticated()) {
5728 return 0;
5729 }
5730
5731 client_lock.unlock();
5732 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5733 client_lock.lock();
5734 if (r < 0) {
5735 return r;
5736 }
5737
5738 whoami = monclient->get_global_id();
5739 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5740
5741 return 0;
5742 }
5743
// Fetch the latest FSMap (full map, or FSMapUser when `user` is true) from
// the monitors, blocking until our cached copy is at least as new as the
// cluster's latest version.  client_lock is dropped around the monitor
// round-trips.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // release the client lock while we wait for the mon reply
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // one-shot subscription until our FSMapUser catches up to fsmap_latest
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same, for the full FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5787
5788 /**
5789 *
5790 * @mds_spec one of ID, rank, GID, "*"
5791 *
5792 */
5793 int Client::mds_command(
5794 const std::string &mds_spec,
5795 const vector<string>& cmd,
5796 const bufferlist& inbl,
5797 bufferlist *outbl,
5798 string *outs,
5799 Context *onfinish)
5800 {
5801 std::lock_guard lock(client_lock);
5802
5803 if (!initialized)
5804 return -ENOTCONN;
5805
5806 int r;
5807 r = authenticate();
5808 if (r < 0) {
5809 return r;
5810 }
5811
5812 r = fetch_fsmap(false);
5813 if (r < 0) {
5814 return r;
5815 }
5816
5817 // Look up MDS target(s) of the command
5818 std::vector<mds_gid_t> targets;
5819 r = resolve_mds(mds_spec, &targets);
5820 if (r < 0) {
5821 return r;
5822 }
5823
5824 // If daemons are laggy, we won't send them commands. If all
5825 // are laggy then we fail.
5826 std::vector<mds_gid_t> non_laggy;
5827 for (const auto gid : targets) {
5828 const auto info = fsmap->get_info_gid(gid);
5829 if (!info.laggy()) {
5830 non_laggy.push_back(gid);
5831 }
5832 }
5833 if (non_laggy.size() == 0) {
5834 *outs = "All targeted MDS daemons are laggy";
5835 return -ENOENT;
5836 }
5837
5838 if (metadata.empty()) {
5839 // We are called on an unmounted client, so metadata
5840 // won't be initialized yet.
5841 populate_metadata("");
5842 }
5843
5844 // Send commands to targets
5845 C_GatherBuilder gather(cct, onfinish);
5846 for (const auto target_gid : non_laggy) {
5847 const auto info = fsmap->get_info_gid(target_gid);
5848
5849 // Open a connection to the target MDS
5850 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
5851
5852 // Generate MDSCommandOp state
5853 auto &op = command_table.start_command();
5854
5855 op.on_finish = gather.new_sub();
5856 op.cmd = cmd;
5857 op.outbl = outbl;
5858 op.outs = outs;
5859 op.inbl = inbl;
5860 op.mds_gid = target_gid;
5861 op.con = conn;
5862
5863 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5864 << " tid=" << op.tid << cmd << dendl;
5865
5866 // Construct and send MCommand
5867 auto m = op.get_message(monclient->get_fsid());
5868 conn->send_message2(std::move(m));
5869 }
5870 gather.activate();
5871
5872 return 0;
5873 }
5874
5875 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5876 {
5877 ceph_tid_t const tid = m->get_tid();
5878
5879 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5880
5881 if (!command_table.exists(tid)) {
5882 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5883 return;
5884 }
5885
5886 auto &op = command_table.get_command(tid);
5887 if (op.outbl) {
5888 *op.outbl = m->get_data();
5889 }
5890 if (op.outs) {
5891 *op.outs = m->rs;
5892 }
5893
5894 if (op.on_finish) {
5895 op.on_finish->complete(m->r);
5896 }
5897
5898 command_table.erase(tid);
5899 }
5900
5901 // -------------------
5902 // MOUNT
5903
5904 int Client::subscribe_mdsmap(const std::string &fs_name)
5905 {
5906 int r = authenticate();
5907 if (r < 0) {
5908 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5909 return r;
5910 }
5911
5912 std::string resolved_fs_name;
5913 if (fs_name.empty()) {
5914 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
5915 if (resolved_fs_name.empty())
5916 // Try the backwards compatibility fs name option
5917 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
5918 } else {
5919 resolved_fs_name = fs_name;
5920 }
5921
5922 std::string want = "mdsmap";
5923 if (!resolved_fs_name.empty()) {
5924 r = fetch_fsmap(true);
5925 if (r < 0)
5926 return r;
5927 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
5928 if (fscid == FS_CLUSTER_ID_NONE) {
5929 return -ENOENT;
5930 }
5931
5932 std::ostringstream oss;
5933 oss << want << "." << fscid;
5934 want = oss.str();
5935 }
5936 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5937
5938 monclient->sub_want(want, 0, 0);
5939 monclient->renew_subs();
5940
5941 return 0;
5942 }
5943
// Mount the filesystem: subscribe to the MDS map, optionally wait until the
// MDS cluster is available, then getattr the mount point and each of its
// ancestors up to the root (tolerating EACCES above the mount point — see
// the quota note below).  On success the root inode is pinned and `mounted`
// is set.  Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or definitively unavailable)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait. MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // getattr the mount point, then pop one dentry per iteration so every
  // ancestor up to "/" gets stat'd as well
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
6037
6038 // UNMOUNT
6039
// Close every MDS session, waiting up to client_shutdown_timeout for each
// close to be acknowledged.  Already-rejected sessions are dropped up
// front; sessions that never answer are force-closed with -ETIMEDOUT.
void Client::_close_sessions()
{
  // rejected sessions will never ack a close — just drop them
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second.state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // adopt the already-held client_lock so the condition waits can drop it
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // a timeout of 0 means wait indefinitely
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	// NOTE(review): `session` is a COPY of the map entry, apparently so
	// it stays valid while _closed_mds_session erases the original from
	// mds_sessions — confirm that discarding state changes made to the
	// copy is intentional.
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(&session, -ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    // give the still-locked mutex back without unlocking it
    l.release();
  }
}
6078
6079 void Client::flush_mdlog_sync()
6080 {
6081 if (mds_requests.empty())
6082 return;
6083 for (auto &p : mds_sessions) {
6084 flush_mdlog(&p.second);
6085 }
6086 }
6087
6088 void Client::flush_mdlog(MetaSession *session)
6089 {
6090 // Only send this to Luminous or newer MDS daemons, older daemons
6091 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6092 const uint64_t features = session->con->get_features();
6093 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6094 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6095 session->con->send_message2(std::move(m));
6096 }
6097 }
6098
6099
// Fail every outstanding MDS request with `err` and force-close all
// sessions.  Used when the client is torn down without a clean shutdown
// (abort / blacklist).
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    // mark the request failed and wake its waiting caller, if any
    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session, err);
  }
}
6127
// Tear down the mount.  With abort=true (or while blacklisted) pending
// state is discarded — sessions aborted, object cache purged, dirty caps
// dropped; otherwise everything is flushed to the MDSs/OSDs first.  Blocks
// until requests drain, open files/dirs are closed, the caches empty out
// and all MDS sessions close.  Called with client_lock held.
void Client::_unmount(bool abort)
{
  // adopt the caller-held client_lock so the condition waits can drop it
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for all outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for handles opened through the low-level (ll) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe (not yet acked) sync writes to complete
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
		    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	// cannot write back — just throw the cached data away
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // no way to flush: drop dirty caps on the floor
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for every inode reference to be released (caps, in-flight I/O, ...)
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // hand the still-locked mutex back to the caller
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6265
6266 void Client::unmount()
6267 {
6268 std::lock_guard lock(client_lock);
6269 _unmount(false);
6270 }
6271
6272 void Client::abort_conn()
6273 {
6274 std::lock_guard lock(client_lock);
6275 _unmount(true);
6276 }
6277
6278 void Client::flush_cap_releases()
6279 {
6280 // send any cap releases
6281 for (auto &p : mds_sessions) {
6282 auto &session = p.second;
6283 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6284 p.first)) {
6285 if (cct->_conf->client_inject_release_failure) {
6286 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6287 } else {
6288 session.con->send_message2(std::move(session.release));
6289 }
6290 session.release.reset();
6291 }
6292 }
6293 }
6294
// Periodic housekeeping, re-armed via the timer every
// client_tick_interval seconds.  Times out mount-phase MDS requests,
// renews caps and flushes queued cap releases, checks delayed caps,
// trims the cache, and retries the connection after blacklisting.
// Runs with client_lock held (taken by the Timer callback).
void Client::tick()
{
  // test hook: stall one tick, then clear the injected delay so it only
  // fires once
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm the next tick before doing any work
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new LambdaContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
	tick();
      }));
  utime_t now = ceph_clock_now();

  // before the mount completes, abort the oldest pending request once it
  // has waited longer than client_mount_timeout, and wake all waiters
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    // renew when a third of the session timeout has elapsed
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: stop at the first inode whose hold time has not yet
  // expired (the loop pops from the front, so earlier entries go first)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);

  // if blacklisted, optionally retry a fresh connection every 30 minutes
  if (blacklisted && mounted &&
      last_auto_reconnect + 30 * 60 < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blacklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6360
6361 void Client::renew_caps()
6362 {
6363 ldout(cct, 10) << "renew_caps()" << dendl;
6364 last_cap_renew = ceph_clock_now();
6365
6366 for (auto &p : mds_sessions) {
6367 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6368 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6369 renew_caps(&p.second);
6370 }
6371 }
6372
6373 void Client::renew_caps(MetaSession *session)
6374 {
6375 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6376 session->last_cap_renew_request = ceph_clock_now();
6377 uint64_t seq = ++session->cap_renew_seq;
6378 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6379 }
6380
6381
6382 // ===============================================================
6383 // high level (POSIXy) interface
6384
6385 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6386 InodeRef *target, const UserPerm& perms)
6387 {
6388 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6389 MetaRequest *req = new MetaRequest(op);
6390 filepath path;
6391 dir->make_nosnap_relative_path(path);
6392 path.push_dentry(name);
6393 req->set_filepath(path);
6394 req->set_inode(dir);
6395 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6396 mask |= DEBUG_GETATTR_CAPS;
6397 req->head.args.getattr.mask = mask;
6398
6399 ldout(cct, 10) << __func__ << " on " << path << dendl;
6400
6401 int r = make_request(req, perms, target);
6402 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6403 return r;
6404 }
6405
// Resolve a single path component `dname` under `dir` into *target.
// Handles ".", ".." and the configured snapdir name specially; otherwise
// tries the dentry cache (trusted via a valid dentry lease or via shared
// caps on the directory) before falling back to an MDS round trip in
// _do_lookup().
//
// @param mask  caps that must be issued on the child inode for a cached
//              dentry to be used without consulting the MDS
// @return 0 on success; -ENOTDIR, -ENAMETOOLONG, -ENOENT, or an MDS error
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // parent not linked in our cache; ask a random in-cluster MDS
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// parent lookup failed; fall back to the directory itself
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // the magic snapdir name maps to a synthetic snapshot directory inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode (when present) carries the
    // caps the caller asked for
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// null dentry + complete directory contents => name cannot exist
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss (or untrusted cache): go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    // cached negative dentry
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6522
6523 int Client::get_or_create(Inode *dir, const char* name,
6524 Dentry **pdn, bool expect_null)
6525 {
6526 // lookup
6527 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6528 dir->open_dir();
6529 if (dir->dir->dentries.count(name)) {
6530 Dentry *dn = dir->dir->dentries[name];
6531
6532 // is dn lease valid?
6533 utime_t now = ceph_clock_now();
6534 if (dn->inode &&
6535 dn->lease_mds >= 0 &&
6536 dn->lease_ttl > now &&
6537 mds_sessions.count(dn->lease_mds)) {
6538 MetaSession &s = mds_sessions.at(dn->lease_mds);
6539 if (s.cap_ttl > now &&
6540 s.cap_gen == dn->lease_gen) {
6541 if (expect_null)
6542 return -EEXIST;
6543 }
6544 }
6545 *pdn = dn;
6546 } else {
6547 // otherwise link up a new one
6548 *pdn = link(dir->dir, name, NULL, NULL);
6549 }
6550
6551 // success
6552 return 0;
6553 }
6554
// Walk `origpath` component by component, starting from root (absolute
// path) or cwd (relative), leaving the final inode in *end.
//
// @param followsym  whether to follow a symlink in the *final* component;
//                   symlinks in intermediate (directory) positions are
//                   always followed
// @param mask       extra caps requested on the last component's lookup
// @return 0 on success; -ELOOP after MAXSYMLINKS expansions; -ENOENT or
//         another error from _lookup()/may_lookup()
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute target restarts the walk from the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6636
6637
6638 // namespace ops
6639
6640 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6641 {
6642 std::lock_guard lock(client_lock);
6643 tout(cct) << "link" << std::endl;
6644 tout(cct) << relexisting << std::endl;
6645 tout(cct) << relpath << std::endl;
6646
6647 if (unmounting)
6648 return -ENOTCONN;
6649
6650 filepath existing(relexisting);
6651
6652 InodeRef in, dir;
6653 int r = path_walk(existing, &in, perm, true);
6654 if (r < 0)
6655 return r;
6656 if (std::string(relpath) == "/") {
6657 r = -EEXIST;
6658 return r;
6659 }
6660 filepath path(relpath);
6661 string name = path.last_dentry();
6662 path.pop_dentry();
6663
6664 r = path_walk(path, &dir, perm, true);
6665 if (r < 0)
6666 return r;
6667 if (cct->_conf->client_permissions) {
6668 if (S_ISDIR(in->mode)) {
6669 r = -EPERM;
6670 return r;
6671 }
6672 r = may_hardlink(in.get(), perm);
6673 if (r < 0)
6674 return r;
6675 r = may_create(dir.get(), perm);
6676 if (r < 0)
6677 return r;
6678 }
6679 r = _link(in.get(), dir.get(), name.c_str(), perm);
6680 return r;
6681 }
6682
6683 int Client::unlink(const char *relpath, const UserPerm& perm)
6684 {
6685 std::lock_guard lock(client_lock);
6686 tout(cct) << __func__ << std::endl;
6687 tout(cct) << relpath << std::endl;
6688
6689 if (unmounting)
6690 return -ENOTCONN;
6691
6692 if (std::string(relpath) == "/")
6693 return -EISDIR;
6694
6695 filepath path(relpath);
6696 string name = path.last_dentry();
6697 path.pop_dentry();
6698 InodeRef dir;
6699 int r = path_walk(path, &dir, perm);
6700 if (r < 0)
6701 return r;
6702 if (cct->_conf->client_permissions) {
6703 r = may_delete(dir.get(), name.c_str(), perm);
6704 if (r < 0)
6705 return r;
6706 }
6707 return _unlink(dir.get(), name.c_str(), perm);
6708 }
6709
6710 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6711 {
6712 std::lock_guard lock(client_lock);
6713 tout(cct) << __func__ << std::endl;
6714 tout(cct) << relfrom << std::endl;
6715 tout(cct) << relto << std::endl;
6716
6717 if (unmounting)
6718 return -ENOTCONN;
6719
6720 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6721 return -EBUSY;
6722
6723 filepath from(relfrom);
6724 filepath to(relto);
6725 string fromname = from.last_dentry();
6726 from.pop_dentry();
6727 string toname = to.last_dentry();
6728 to.pop_dentry();
6729
6730 InodeRef fromdir, todir;
6731 int r = path_walk(from, &fromdir, perm);
6732 if (r < 0)
6733 goto out;
6734 r = path_walk(to, &todir, perm);
6735 if (r < 0)
6736 goto out;
6737
6738 if (cct->_conf->client_permissions) {
6739 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6740 if (r < 0)
6741 return r;
6742 r = may_delete(todir.get(), toname.c_str(), perm);
6743 if (r < 0 && r != -ENOENT)
6744 return r;
6745 }
6746 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6747 out:
6748 return r;
6749 }
6750
6751 // dirs
6752
6753 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6754 {
6755 std::lock_guard lock(client_lock);
6756 tout(cct) << __func__ << std::endl;
6757 tout(cct) << relpath << std::endl;
6758 tout(cct) << mode << std::endl;
6759 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6760
6761 if (unmounting)
6762 return -ENOTCONN;
6763
6764 if (std::string(relpath) == "/")
6765 return -EEXIST;
6766
6767 filepath path(relpath);
6768 string name = path.last_dentry();
6769 path.pop_dentry();
6770 InodeRef dir;
6771 int r = path_walk(path, &dir, perm);
6772 if (r < 0)
6773 return r;
6774 if (cct->_conf->client_permissions) {
6775 r = may_create(dir.get(), perm);
6776 if (r < 0)
6777 return r;
6778 }
6779 return _mkdir(dir.get(), name.c_str(), mode, perm);
6780 }
6781
// Create every missing directory along `relpath` (like `mkdir -p`).
// Phase 1 walks the already-existing prefix component by component;
// phase 2 creates each remaining component with _mkdir(), tolerating a
// racing creator for intermediate components.
// NOTE(review): the walk starts at cwd in both phases -- absolute paths
// appear to rely on filepath's component handling; confirm against callers.
//
// @return 0 on success (also when the full path already exists), or a
//         negative errno from lookup/permission/mkdir
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // any error other than "component missing" (including success for the
  // whole path) ends the call here
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing creator made this intermediate dir: look it up and continue
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6836
6837 int Client::rmdir(const char *relpath, const UserPerm& perms)
6838 {
6839 std::lock_guard lock(client_lock);
6840 tout(cct) << __func__ << std::endl;
6841 tout(cct) << relpath << std::endl;
6842
6843 if (unmounting)
6844 return -ENOTCONN;
6845
6846 if (std::string(relpath) == "/")
6847 return -EBUSY;
6848
6849 filepath path(relpath);
6850 string name = path.last_dentry();
6851 path.pop_dentry();
6852 InodeRef dir;
6853 int r = path_walk(path, &dir, perms);
6854 if (r < 0)
6855 return r;
6856 if (cct->_conf->client_permissions) {
6857 int r = may_delete(dir.get(), name.c_str(), perms);
6858 if (r < 0)
6859 return r;
6860 }
6861 return _rmdir(dir.get(), name.c_str(), perms);
6862 }
6863
6864 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6865 {
6866 std::lock_guard lock(client_lock);
6867 tout(cct) << __func__ << std::endl;
6868 tout(cct) << relpath << std::endl;
6869 tout(cct) << mode << std::endl;
6870 tout(cct) << rdev << std::endl;
6871
6872 if (unmounting)
6873 return -ENOTCONN;
6874
6875 if (std::string(relpath) == "/")
6876 return -EEXIST;
6877
6878 filepath path(relpath);
6879 string name = path.last_dentry();
6880 path.pop_dentry();
6881 InodeRef dir;
6882 int r = path_walk(path, &dir, perms);
6883 if (r < 0)
6884 return r;
6885 if (cct->_conf->client_permissions) {
6886 int r = may_create(dir.get(), perms);
6887 if (r < 0)
6888 return r;
6889 }
6890 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6891 }
6892
6893 // symlinks
6894
6895 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6896 {
6897 std::lock_guard lock(client_lock);
6898 tout(cct) << __func__ << std::endl;
6899 tout(cct) << target << std::endl;
6900 tout(cct) << relpath << std::endl;
6901
6902 if (unmounting)
6903 return -ENOTCONN;
6904
6905 if (std::string(relpath) == "/")
6906 return -EEXIST;
6907
6908 filepath path(relpath);
6909 string name = path.last_dentry();
6910 path.pop_dentry();
6911 InodeRef dir;
6912 int r = path_walk(path, &dir, perms);
6913 if (r < 0)
6914 return r;
6915 if (cct->_conf->client_permissions) {
6916 int r = may_create(dir.get(), perms);
6917 if (r < 0)
6918 return r;
6919 }
6920 return _symlink(dir.get(), name.c_str(), target, perms);
6921 }
6922
6923 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6924 {
6925 std::lock_guard lock(client_lock);
6926 tout(cct) << __func__ << std::endl;
6927 tout(cct) << relpath << std::endl;
6928
6929 if (unmounting)
6930 return -ENOTCONN;
6931
6932 filepath path(relpath);
6933 InodeRef in;
6934 int r = path_walk(path, &in, perms, false);
6935 if (r < 0)
6936 return r;
6937
6938 return _readlink(in.get(), buf, size);
6939 }
6940
6941 int Client::_readlink(Inode *in, char *buf, size_t size)
6942 {
6943 if (!in->is_symlink())
6944 return -EINVAL;
6945
6946 // copy into buf (at most size bytes)
6947 int r = in->symlink.length();
6948 if (r > (int)size)
6949 r = size;
6950 memcpy(buf, in->symlink.c_str(), r);
6951 return r;
6952 }
6953
6954
6955 // inode stuff
6956
6957 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6958 {
6959 bool yes = in->caps_issued_mask(mask, true);
6960
6961 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6962 if (yes && !force)
6963 return 0;
6964
6965 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6966 filepath path;
6967 in->make_nosnap_relative_path(path);
6968 req->set_filepath(path);
6969 req->set_inode(in);
6970 req->head.args.getattr.mask = mask;
6971
6972 int res = make_request(req, perms);
6973 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6974 return res;
6975 }
6976
// Apply the attribute changes in `stx` selected by `mask` to `in`.
// Changes covered by locally-held exclusive caps (Ax for ownership/mode/
// btime, Fx for mtime/atime) are applied in the client cache and the caps
// marked dirty; whatever remains in `mask` afterwards is sent to the MDS
// as a SETATTR request.  A sync request is also forced when the caller's
// uid/gid differs from the current cap dirtier, so each update carries its
// own credentials.
//
// @param inp  optional out: resulting inode from the MDS reply
// @return 0 on success; -EROFS for snapshot inodes, -EDQUOT when a size
//         increase would exceed quota, -EFBIG for an oversized truncate,
//         or an MDS error
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must fit within the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; with none, fall through to a
    // sync CTIME request
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with Ax we can clear setuid/setgid bits ourselves
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with Fx, timestamps can be updated locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // everything handled locally; no MDS round trip needed
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // build a SETATTR request carrying whatever could not be done locally
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  // inode_drop releases the listed caps when the request is sent, so other
  // clients see the new attributes
  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7167
7168 /* Note that we only care about attrs that setattr cares about */
7169 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7170 {
7171 stx->stx_size = st->st_size;
7172 stx->stx_mode = st->st_mode;
7173 stx->stx_uid = st->st_uid;
7174 stx->stx_gid = st->st_gid;
7175 #ifdef __APPLE__
7176 stx->stx_mtime = st->st_mtimespec;
7177 stx->stx_atime = st->st_atimespec;
7178 #else
7179 stx->stx_mtime = st->st_mtim;
7180 stx->stx_atime = st->st_atim;
7181 #endif
7182 }
7183
7184 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7185 const UserPerm& perms, InodeRef *inp)
7186 {
7187 int ret = _do_setattr(in, stx, mask, perms, inp);
7188 if (ret < 0)
7189 return ret;
7190 if (mask & CEPH_SETATTR_MODE)
7191 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7192 return ret;
7193 }
7194
7195 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7196 const UserPerm& perms)
7197 {
7198 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7199 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7200 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7201 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7202 if (cct->_conf->client_permissions) {
7203 int r = may_setattr(in.get(), stx, mask, perms);
7204 if (r < 0)
7205 return r;
7206 }
7207 return __setattrx(in.get(), stx, mask, perms);
7208 }
7209
7210 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7211 const UserPerm& perms)
7212 {
7213 struct ceph_statx stx;
7214
7215 stat_to_statx(attr, &stx);
7216 mask &= ~CEPH_SETATTR_BTIME;
7217
7218 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7219 mask &= ~CEPH_SETATTR_UID;
7220 }
7221 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7222 mask &= ~CEPH_SETATTR_GID;
7223 }
7224
7225 return _setattrx(in, &stx, mask, perms);
7226 }
7227
7228 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7229 const UserPerm& perms)
7230 {
7231 std::lock_guard lock(client_lock);
7232 tout(cct) << __func__ << std::endl;
7233 tout(cct) << relpath << std::endl;
7234 tout(cct) << mask << std::endl;
7235
7236 if (unmounting)
7237 return -ENOTCONN;
7238
7239 filepath path(relpath);
7240 InodeRef in;
7241 int r = path_walk(path, &in, perms);
7242 if (r < 0)
7243 return r;
7244 return _setattr(in, attr, mask, perms);
7245 }
7246
7247 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7248 const UserPerm& perms, int flags)
7249 {
7250 std::lock_guard lock(client_lock);
7251 tout(cct) << __func__ << std::endl;
7252 tout(cct) << relpath << std::endl;
7253 tout(cct) << mask << std::endl;
7254
7255 if (unmounting)
7256 return -ENOTCONN;
7257
7258 filepath path(relpath);
7259 InodeRef in;
7260 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7261 if (r < 0)
7262 return r;
7263 return _setattrx(in, stx, mask, perms);
7264 }
7265
7266 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7267 {
7268 std::lock_guard lock(client_lock);
7269 tout(cct) << __func__ << std::endl;
7270 tout(cct) << fd << std::endl;
7271 tout(cct) << mask << std::endl;
7272
7273 if (unmounting)
7274 return -ENOTCONN;
7275
7276 Fh *f = get_filehandle(fd);
7277 if (!f)
7278 return -EBADF;
7279 #if defined(__linux__) && defined(O_PATH)
7280 if (f->flags & O_PATH)
7281 return -EBADF;
7282 #endif
7283 return _setattr(f->inode, attr, mask, perms);
7284 }
7285
// ceph_statx flavour of fsetattr(); same fd validation rules
// (unknown fd or O_PATH fd -> -EBADF).
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
7305
// POSIX stat(): resolve relpath (following symlinks), refresh the inode
// via _getattr() with the requested cap `mask`, then fill *stbuf from
// the cached inode state. Optionally also returns the directory's
// frag_info in *dirstat.
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7331
7332 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7333 {
7334 unsigned mask = 0;
7335
7336 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7337 if (flags & AT_NO_ATTR_SYNC)
7338 goto out;
7339
7340 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7341 mask |= CEPH_CAP_PIN;
7342 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7343 mask |= CEPH_CAP_AUTH_SHARED;
7344 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7345 mask |= CEPH_CAP_LINK_SHARED;
7346 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7347 mask |= CEPH_CAP_FILE_SHARED;
7348 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7349 mask |= CEPH_CAP_XATTR_SHARED;
7350 out:
7351 return mask;
7352 }
7353
// statx(): like stat() but with fine-grained field selection. `want` is
// a set of CEPH_STATX_* bits, converted by statx_to_mask() into the cap
// mask to refresh; `flags` may carry AT_SYMLINK_NOFOLLOW and
// AT_NO_ATTR_SYNC. stx->stx_mask reports which fields were filled.
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
7385
// POSIX lstat(): identical to stat() except the final path component is
// not dereferenced if it is a symlink (path_walk with followsym=false).
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7412
// Fill a struct stat from the client's cached inode state. Issues no
// MDS requests; callers refresh the inode first (e.g. _getattr()) to
// whatever freshness they need. Optionally copies out the directory
// frag/nest stats. Returns the caps currently issued on the inode.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;  // stable fake ino for NFS-style reexport
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;       // snapid doubles as the device number
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Directories report a synthesized link count; a dir can only ever
    // have zero links (unlinked) or one (its parent dentry).
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();  // nlink > 1 on a dir is an invariant violation
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report the later of ctime/mtime as ctime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Dir "size" is either the recursive byte count or the entry count,
    // per the client_dirsize_rbytes config option.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7474
// Fill a ceph_statx from cached inode state. `mask` is the CEPH_CAP_*
// mask of caps known valid; only fields covered by valid caps are
// filled, and stx->stx_mask records which CEPH_STATX_* fields were set.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;  // full mode, not just the type bits
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // Same synthesized dir link count as fill_stat(): dirs may only
      // have nlink 0 (unlinked) or 1 (linked from parent).
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Dir size: recursive bytes or entry count, per config.
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report the later of ctime/mtime as ctime.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7557
// Mark a dentry as recently used in the client's dentry LRU.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7562
// POSIX chmod(): resolve relpath (following symlinks) and update the
// mode. Only attr.st_mode is initialized; the CEPH_SETATTR_MODE mask
// tells _setattr() to ignore all other fields.
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7582
// POSIX fchmod(): chmod via an open fd. -EBADF for an unknown fd or
// (on Linux) one opened with O_PATH.
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
7604
// lchmod(): like chmod() but does not dereference a final symlink.
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
7625
// POSIX chown(): resolve relpath (following symlinks) and set owner
// and/or group. Both CEPH_SETATTR_UID|GID are passed; uid/gid values of
// (uid_t)-1 / (gid_t)-1 are masked out inside _setattr() (see the -1
// checks in the _setattr wrapper above), matching chown(2) semantics.
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
7648
// POSIX fchown(): chown via an open fd. A uid/gid of -1 means "leave
// unchanged" and is excluded from the setattr mask here explicitly.
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
7675
// lchown(): like chown() but does not dereference a final symlink.
// As in fchown(), a uid/gid of -1 means "leave unchanged".
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
7702
// Helper: store atime and mtime into a struct stat through the
// portable stat_set_* accessors.
static void attr_set_atime_and_mtime(struct stat *attr,
				     const utime_t &atime,
				     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7712
7713 // for [l]utime() invoke the timeval variant as the timespec
7714 // variant are not yet implemented. for futime[s](), invoke
7715 // the timespec variant.
7716 int Client::utime(const char *relpath, struct utimbuf *buf,
7717 const UserPerm& perms)
7718 {
7719 struct timeval tv[2];
7720 tv[0].tv_sec = buf->actime;
7721 tv[0].tv_usec = 0;
7722 tv[1].tv_sec = buf->modtime;
7723 tv[1].tv_usec = 0;
7724
7725 return utimes(relpath, tv, perms);
7726 }
7727
7728 int Client::lutime(const char *relpath, struct utimbuf *buf,
7729 const UserPerm& perms)
7730 {
7731 struct timeval tv[2];
7732 tv[0].tv_sec = buf->actime;
7733 tv[0].tv_usec = 0;
7734 tv[1].tv_sec = buf->modtime;
7735 tv[1].tv_usec = 0;
7736
7737 return lutimes(relpath, tv, perms);
7738 }
7739
7740 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7741 {
7742 struct timespec ts[2];
7743 ts[0].tv_sec = buf->actime;
7744 ts[0].tv_nsec = 0;
7745 ts[1].tv_sec = buf->modtime;
7746 ts[1].tv_nsec = 0;
7747
7748 return futimens(fd, ts, perms);
7749 }
7750
// POSIX utimes(): set atime/mtime on the inode at relpath, following
// symlinks. times[0] is atime, times[1] is mtime.
int Client::utimes(const char *relpath, struct timeval times[2],
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7777
// lutimes(): like utimes() but does not dereference a final symlink.
int Client::lutimes(const char *relpath, struct timeval times[2],
		    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, false);  // don't follow symlinks
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7804
7805 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7806 {
7807 struct timespec ts[2];
7808 ts[0].tv_sec = times[0].tv_sec;
7809 ts[0].tv_nsec = times[0].tv_usec * 1000;
7810 ts[1].tv_sec = times[1].tv_sec;
7811 ts[1].tv_nsec = times[1].tv_usec * 1000;
7812
7813 return futimens(fd, ts, perms);
7814 }
7815
// futimens(): set atime/mtime (nanosecond precision) through an open
// fd. -EBADF for an unknown fd or (on Linux) one opened with O_PATH.
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7843
// flock(): BSD-style advisory lock on an open fd. `owner` identifies
// the lock owner passed through to _flock().
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7861
// opendir(): resolve relpath (following symlinks), optionally enforce
// read permission (when client_permissions is set), and allocate a
// dir_result_t in *dirpp via _opendir().
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7887
// Allocate a new dir_result_t for `in` and register it in opened_dirs.
// Returns -ENOTDIR (leaving *dirpp untouched) if `in` is not a
// directory; 0 on success.
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
7897
7898
// closedir(): destroy an open dir handle. Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7909
// Release everything held by a dir_result_t: drop the inode reference,
// free the readdir buffer, unregister from opened_dirs, and delete it.
// `dirp` is invalid after this call.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7921
// rewinddir(): drop the buffered readdir state and reset the stream to
// the beginning. No-op while unmounting.
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7934
// telldir(): return the current readdir stream position, suitable for a
// later seekdir(). (Reads only the offset; no lock is taken here.)
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7941
// seekdir(): reposition the readdir stream to `offset` (a value from
// telldir()). Invalidates the buffered frag when the target cannot be
// served from the current buffer, and disables readdir-cache fill
// bookkeeping as appropriate for the seek direction.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered streams can only keep the buffer on forward seeks
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered: drop the buffer on rewind, on a different frag, or
    // when seeking backwards within the buffered frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7975
7976
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Fill a struct dirent; names longer than 255 bytes are silently
// truncated. d_off is only available on platforms that have it.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';  // ensure NUL termination even when truncated
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);  // convert S_IF* mode bits to DT_* type
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7999
// Advance the readdir stream past the currently buffered frag. If this
// was the rightmost frag we're done (mark end-of-dir); otherwise move
// the stream offset to the start of the next frag.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);  // our frag may have been split/merged
  }
}
8025
// Re-map the stream's current frag through the inode's dirfragtree in
// case the directory was fragmented differently since we last looked;
// restart that frag from the beginning if the mapping changed.
// No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8042
// Discard the buffered dirents for this stream; the next read will
// re-fetch from the MDS (or the readdir cache).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8048
// Fetch one frag worth of dirents from the MDS into dirp->buffer via a
// READDIR (or LSSNAP for the snapdir) request. On -EAGAIN the dirfrag
// mapping is re-chosen and the fetch retried (tail recursion). On any
// other error the stream is marked at-end.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue this frag after the last entry we returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // frag mapping changed under us; re-choose and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
	     << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8103
// Comparator for std::lower_bound over a readdir cache: orders cached
// dentries by their readdir file position.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8109
// Serve readdir entries from the client-side dentry cache, invoking
// `cb` once per entry. Requires client_lock held; the lock is DROPPED
// around each callback invocation, so cached state is revalidated after
// every _getattr() and every callback. Returns 0 at end of directory,
// -EAGAIN if the cache became unusable mid-iteration (caller falls back
// to fetching from the MDS), a negative error, or a positive value if
// the callback asked to stop early. If `getref` is set, each inode
// passed to the callback carries an extra ll ref the callback owns.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first entry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;   // cache invalidated; fall back to MDS readdir
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
	     << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;   // callback requested early stop
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8199
// Core readdir driver. Synthesizes "." and ".." (stream offsets 0 and
// 1), then serves entries either from the client dentry cache
// (_readdir_cache_cb) or by fetching frags from the MDS
// (_readdir_get_frag), invoking `cb` for each entry. client_lock is
// dropped around every callback invocation. Returns 0 at end of
// directory, a negative error, or a positive value when the callback
// asks to stop early. When the rightmost frag completes without the
// directory changing underneath us, the inode is marked I_COMPLETE
// (and I_DIR_ORDERED when ordering also held) so future readdirs can
// be served from cache.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    // synthesize "."
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    // synthesize ".." (the dir itself if it has no parent dentry)
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable; fall through to MDS readdir
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;  // freshly fetched; attrs are already current
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Finished the rightmost frag. If the directory didn't change while
    // we iterated, mark the cache complete (and ordered if possible).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached; loop always returns
  return 0;
}
8392
8393
8394 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8395 {
8396 return readdirplus_r(d, de, 0, 0, 0, NULL);
8397 }
8398
8399 /*
8400 * readdirplus_r
8401 *
8402 * returns
8403 * 1 if we got a dirent
8404 * 0 for end of directory
8405 * <0 on error
8406 */
8407
// Context for _readdir_single_dirent_cb(): captures exactly one directory
// entry out of a readdir_r_cb() iteration.
struct single_readdir {
  struct dirent *de;       // caller-owned dirent to fill in
  struct ceph_statx *stx;  // optional statx to fill in; may be NULL
  Inode *inode;            // inode of the captured entry (set by the callback)
  bool full;               // true once one entry has been captured
};
8414
8415 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8416 struct ceph_statx *stx, off_t off,
8417 Inode *in)
8418 {
8419 single_readdir *c = static_cast<single_readdir *>(p);
8420
8421 if (c->full)
8422 return -1; // already filled this dirent
8423
8424 *c->de = *de;
8425 if (c->stx)
8426 *c->stx = *stx;
8427 c->inode = in;
8428 c->full = true;
8429 return 1;
8430 }
8431
8432 struct dirent *Client::readdir(dir_result_t *d)
8433 {
8434 int ret;
8435 static struct dirent de;
8436 single_readdir sr;
8437 sr.de = &de;
8438 sr.stx = NULL;
8439 sr.inode = NULL;
8440 sr.full = false;
8441
8442 // our callback fills the dirent and sets sr.full=true on first
8443 // call, and returns -1 the second time around.
8444 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8445 if (ret < -1) {
8446 errno = -ret; // this sucks.
8447 return (dirent *) NULL;
8448 }
8449 if (sr.full) {
8450 return &de;
8451 }
8452 return (dirent *) NULL;
8453 }
8454
8455 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8456 struct ceph_statx *stx, unsigned want,
8457 unsigned flags, Inode **out)
8458 {
8459 single_readdir sr;
8460 sr.de = de;
8461 sr.stx = stx;
8462 sr.inode = NULL;
8463 sr.full = false;
8464
8465 // our callback fills the dirent and sets sr.full=true on first
8466 // call, and returns -1 the second time around.
8467 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8468 if (r < -1)
8469 return r;
8470 if (out)
8471 *out = sr.inode;
8472 if (sr.full)
8473 return 1;
8474 return 0;
8475 }
8476
8477
8478 /* getdents */
// Context for _readdir_getdent_cb(): packs directory entries into a flat
// caller-supplied buffer, either as whole dirents or as bare names.
struct getdents_result {
  char *buf;     // output buffer provided by the caller
  int buflen;    // total capacity of buf in bytes
  int pos;       // bytes written into buf so far
  bool fullent;  // true: pack whole struct dirent; false: pack NUL-terminated names
};
8485
8486 static int _readdir_getdent_cb(void *p, struct dirent *de,
8487 struct ceph_statx *stx, off_t off, Inode *in)
8488 {
8489 struct getdents_result *c = static_cast<getdents_result *>(p);
8490
8491 int dlen;
8492 if (c->fullent)
8493 dlen = sizeof(*de);
8494 else
8495 dlen = strlen(de->d_name) + 1;
8496
8497 if (c->pos + dlen > c->buflen)
8498 return -1; // doesn't fit
8499
8500 if (c->fullent) {
8501 memcpy(c->buf + c->pos, de, sizeof(*de));
8502 } else {
8503 memcpy(c->buf + c->pos, de->d_name, dlen);
8504 }
8505 c->pos += dlen;
8506 return 0;
8507 }
8508
8509 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8510 {
8511 getdents_result gr;
8512 gr.buf = buf;
8513 gr.buflen = buflen;
8514 gr.fullent = fullent;
8515 gr.pos = 0;
8516
8517 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8518
8519 if (r < 0) { // some error
8520 if (r == -1) { // buffer ran out of space
8521 if (gr.pos) { // but we got some entries already!
8522 return gr.pos;
8523 } // or we need a larger buffer
8524 return -ERANGE;
8525 } else { // actual error, return it
8526 return r;
8527 }
8528 }
8529 return gr.pos;
8530 }
8531
8532
8533 /* getdir */
// Context for _getdir_cb(): collects entry names for Client::getdir().
struct getdir_result {
  list<string> *contents;  // accumulates entry names in readdir order
  int num;                 // number of entries appended
};
8538
8539 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8540 {
8541 getdir_result *r = static_cast<getdir_result *>(p);
8542
8543 r->contents->push_back(de->d_name);
8544 r->num++;
8545 return 0;
8546 }
8547
8548 int Client::getdir(const char *relpath, list<string>& contents,
8549 const UserPerm& perms)
8550 {
8551 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8552 {
8553 std::lock_guard lock(client_lock);
8554 tout(cct) << "getdir" << std::endl;
8555 tout(cct) << relpath << std::endl;
8556 }
8557
8558 dir_result_t *d;
8559 int r = opendir(relpath, &d, perms);
8560 if (r < 0)
8561 return r;
8562
8563 getdir_result gr;
8564 gr.contents = &contents;
8565 gr.num = 0;
8566 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8567
8568 closedir(d);
8569
8570 if (r < 0)
8571 return r;
8572 return gr.num;
8573 }
8574
8575
8576 /****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  // Open (and optionally create) the file at relpath with POSIX open-flag
  // semantics (O_CREAT/O_EXCL/O_NOFOLLOW/O_PATH/...).  On success returns a
  // client-local integer file descriptor; on failure a negative errno.  The
  // striping parameters are only applied when a new file is created.
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // O_CREAT|O_EXCL demands the path did not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // refuse to open a symlink itself with O_NOFOLLOW (unless O_PATH allows it)
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: walk to the parent directory,
    // check create permission, then create the child
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8661
8662 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8663 {
8664 /* Use default file striping parameters */
8665 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8666 }
8667
8668 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8669 const UserPerm& perms)
8670 {
8671 std::lock_guard lock(client_lock);
8672 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8673
8674 if (unmounting)
8675 return -ENOTCONN;
8676
8677 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8678 filepath path(ino);
8679 req->set_filepath(path);
8680
8681 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8682 char f[30];
8683 sprintf(f, "%u", h);
8684 filepath path2(dirino);
8685 path2.push_dentry(string(f));
8686 req->set_filepath2(path2);
8687
8688 int r = make_request(req, perms, NULL, NULL,
8689 rand() % mdsmap->get_num_in_mds());
8690 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8691 return r;
8692 }
8693
8694
8695 /**
8696 * Load inode into local cache.
8697 *
8698 * If inode pointer is non-NULL, and take a reference on
8699 * the resulting Inode object in one operation, so that caller
8700 * can safely assume inode will still be there after return.
8701 */
8702 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8703 {
8704 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
8705
8706 if (unmounting)
8707 return -ENOTCONN;
8708
8709 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8710 filepath path(ino);
8711 req->set_filepath(path);
8712
8713 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8714 if (r == 0 && inode != NULL) {
8715 vinodeno_t vino(ino, CEPH_NOSNAP);
8716 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8717 ceph_assert(p != inode_map.end());
8718 *inode = p->second;
8719 _ll_get(*inode);
8720 }
8721 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
8722 return r;
8723 }
8724
// Public, locked entry point for _lookup_ino(); see the comment above.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8730
8731 /**
8732 * Find the parent inode of `ino` and insert it into
8733 * our cache. Conditionally also set `parent` to a referenced
8734 * Inode* if caller provides non-NULL value.
8735 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  // Issue a LOOKUPPARENT so the MDS resolves (and we cache) the parent of
  // `ino`.  On success, *parent (if provided) receives a referenced inode.
  // NOTE(review): unlike _lookup_ino()/_lookup_name(), this helper has no
  // `unmounting` -> -ENOTCONN guard -- confirm whether that is intentional
  // (e.g. it is needed on unmount-time paths) or an oversight.
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      // hand the caller a referenced parent inode
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8759
8760 /**
8761 * Populate the parent dentry for `ino`, provided it is
8762 * a child of `parent`.
8763 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  // Issue a LOOKUPNAME so the MDS tells us the dentry linking `ino` into
  // directory `parent`; the reply populates that dentry locally.
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any active MDS can answer; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8781
// Public, locked entry point for _lookup_name(); see the comment above.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8787
8788 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8789 {
8790 ceph_assert(in);
8791 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
8792
8793 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8794
8795 if (in->snapid != CEPH_NOSNAP) {
8796 in->snap_cap_refs++;
8797 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8798 << ccap_string(in->caps_issued()) << dendl;
8799 }
8800
8801 const auto& conf = cct->_conf;
8802 f->readahead.set_trigger_requests(1);
8803 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8804 uint64_t max_readahead = Readahead::NO_LIMIT;
8805 if (conf->client_readahead_max_bytes) {
8806 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8807 }
8808 if (conf->client_readahead_max_periods) {
8809 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8810 }
8811 f->readahead.set_max_readahead_size(max_readahead);
8812 vector<uint64_t> alignments;
8813 alignments.push_back(in->layout.get_period());
8814 alignments.push_back(in->layout.stripe_unit);
8815 f->readahead.set_alignments(alignments);
8816
8817 return f;
8818 }
8819
int Client::_release_fh(Fh *f)
{
  // Tear down an open-file handle: drop the open ref / snap counter on the
  // inode, release file locks held through the handle, and surface any
  // async write error that accumulated on it.  Returns 0 or a negative
  // errno (the deferred async error, if any).
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // this handle can no longer hold a delegation
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref in this mode: flush dirty data and re-evaluate caps
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens with snap_cap_refs (see _create_fh)
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8854
8855 void Client::_put_fh(Fh *f)
8856 {
8857 int left = f->put();
8858 if (!left) {
8859 delete f;
8860 }
8861 }
8862
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // Core open path: take an open ref in the computed cap mode, talk to the
  // MDS if we do not already hold the caps that mode wants, and hand back a
  // new Fh through *fhp on success.  Returns 0 or a negative errno.
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    // snapshots are read-only
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs;
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    // otherwise send an explicit OPEN to the MDS

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary Fh used only to drive get_caps(); never installed anywhere
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
	// NOTE(review): put_open_ref is called here AND again in the common
	// failure branch below when result < 0 -- verify this is not a
	// double decrement of the open ref.
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken at the top
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8944
int Client::_renew_caps(Inode *in)
{
  // Re-acquire file caps for `in`.  If we still hold some caps and either
  // want no write caps or still have an auth cap, a plain cap update is
  // enough; otherwise replay an OPEN request matching the currently wanted
  // modes.
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate wanted cap bits back into open flags for the OPEN replay
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8982
8983 int Client::close(int fd)
8984 {
8985 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8986 std::lock_guard lock(client_lock);
8987 tout(cct) << "close" << std::endl;
8988 tout(cct) << fd << std::endl;
8989
8990 if (unmounting)
8991 return -ENOTCONN;
8992
8993 Fh *fh = get_filehandle(fd);
8994 if (!fh)
8995 return -EBADF;
8996 int err = _release_fh(fh);
8997 fd_map.erase(fd);
8998 put_fd(fd);
8999 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9000 return err;
9001 }
9002
9003
9004 // ------------
9005 // read, write
9006
9007 loff_t Client::lseek(int fd, loff_t offset, int whence)
9008 {
9009 std::lock_guard lock(client_lock);
9010 tout(cct) << "lseek" << std::endl;
9011 tout(cct) << fd << std::endl;
9012 tout(cct) << offset << std::endl;
9013 tout(cct) << whence << std::endl;
9014
9015 if (unmounting)
9016 return -ENOTCONN;
9017
9018 Fh *f = get_filehandle(fd);
9019 if (!f)
9020 return -EBADF;
9021 #if defined(__linux__) && defined(O_PATH)
9022 if (f->flags & O_PATH)
9023 return -EBADF;
9024 #endif
9025 return _lseek(f, offset, whence);
9026 }
9027
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  // Compute and store the new file position for `f`.  Whence values that
  // depend on the current file size (SEEK_END, SEEK_DATA, SEEK_HOLE)
  // revalidate the size with a getattr first.  Returns the new position or
  // a negative errno.
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // refresh in->size before the size-relative computations below
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // no sparseness information here: data is reported everywhere in-file
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    // ...and correspondingly the only reported hole is at EOF
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  if (pos < 0) {
    // e.g. SEEK_CUR/SEEK_END that landed before the start of the file
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9101
9102
void Client::lock_fh_pos(Fh *f)
{
  // Serialize access to f->pos for I/O paths that implicitly use/advance
  // the file position.  Waiters queue FIFO on per-waiter condition
  // variables; unlock_fh_pos() releases the lock.  client_lock must be
  // held on entry and is held again on return.
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // wait on client_lock without permanently transferring its ownership
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // proceed only when the lock is free AND we are first in line
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9123
9124 void Client::unlock_fh_pos(Fh *f)
9125 {
9126 ldout(cct, 10) << __func__ << " " << f << dendl;
9127 f->pos_locked = false;
9128 }
9129
int Client::uninline_data(Inode *in, Context *onfinish)
{
  // Migrate MDS-inlined file data out to the first RADOS object of the
  // file.  Completion (or failure) is reported through `onfinish`.
  if (!in->inline_data.length()) {
    // nothing is inlined; report success immediately
    onfinish->complete(0);
    return 0;
  }

  // the first object of a file is named "<ino in hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // step 1: make sure the object exists (non-exclusive create)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // step 2: write the inline payload, guarded by a cmpxattr on
  // "inline_version" so a stale writer cannot clobber newer object data
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9174
9175 //
9176
9177 // blocking osd interface
9178
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  // POSIX-style pread into a flat buffer.  Returns the number of bytes
  // read, or a negative errno.
  std::unique_lock lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no data access
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out without holding client_lock; bl is local, so this is safe
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9209
9210 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9211 {
9212 if (iovcnt < 0)
9213 return -EINVAL;
9214 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9215 }
9216
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  // Core read path.  offset < 0 means "use and advance f->pos" (the fh
  // position lock is held for the duration).  Handles inline data, cached
  // (ObjectCacher) reads, and sync reads, retrying once when a short read
  // races with a size change.  Returns bytes read or a negative errno.
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // implicit-position read: serialize on the fh position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // inline state unknown; fetch it before deciding how to read
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  // take a read cap ref; CACHE (plus LAZYIO for lazy handles) is optional
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the cache entirely
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // cannot serve inline data without the cache cap; kick off
      // uninlining and fall through to a normal object read
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read directly from the inline blob, zero-filling the
      // gap between the end of the blob and EOF
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // cached read through the ObjectCacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // sync read straight from the OSDs
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read; revalidate the size before concluding it was EOF
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait for the uninline op started above (client_lock dropped meanwhile)
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9356
// Completion context for background readahead issued by _read_async().
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  // pin the Fh and account one in-flight readahead on it
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  // undo the pinning done in the constructor
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // drop the cap refs taken when the readahead was issued (_read_async)
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9372
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  // Read through the ObjectCacher, blocking on a cache miss, and
  // opportunistically issue readahead for the region that follows.
  // Returns bytes read or a negative errno.
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the fill, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // fire-and-forget readahead; C_Readahead::finish drops the cap refs
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// everything was already cached; the context will never fire
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9427
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  // Synchronous read path: issue Filer reads directly against the OSDs
  // until the request is satisfied.  Short reads inside the known file
  // size are zero-filled (sparse objects); a short read at the tail sets
  // *checkeof so the caller can revalidate the size.  Returns bytes read
  // (including zero-fill) or a negative errno.
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // block for the OSD reply without holding client_lock
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // apparent EOF; let the caller double-check the file size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9487
9488
9489 /*
9490 * we keep count of uncommitted sync writes on the inode, so that
9491 * fsync can DDRT.
9492 */
void Client::_sync_write_commit(Inode *in)
{
  // Completion bookkeeping for a sync (uncached) write: decrement the
  // outstanding counter, drop the buffer cap ref, and let a pending
  // unmount proceed once the last unsafe write has committed.
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9506
9507 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9508 {
9509 std::lock_guard lock(client_lock);
9510 tout(cct) << "write" << std::endl;
9511 tout(cct) << fd << std::endl;
9512 tout(cct) << size << std::endl;
9513 tout(cct) << offset << std::endl;
9514
9515 if (unmounting)
9516 return -ENOTCONN;
9517
9518 Fh *fh = get_filehandle(fd);
9519 if (!fh)
9520 return -EBADF;
9521 #if defined(__linux__) && defined(O_PATH)
9522 if (fh->flags & O_PATH)
9523 return -EBADF;
9524 #endif
9525 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9526 size = std::min(size, (loff_t)INT_MAX);
9527 int r = _write(fh, offset, size, buf, NULL, false);
9528 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9529 return r;
9530 }
9531
9532 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9533 {
9534 if (iovcnt < 0)
9535 return -EINVAL;
9536 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9537 }
9538
// Common implementation for preadv/pwritev: compute the total transfer
// length from the iovec array, then dispatch to _write() (gather) or
// _read() (scatter back into the iovecs). Caller must hold client_lock.
// Returns bytes transferred or -errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot perform I/O
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write gathers directly from the iovecs (buf == NULL)
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the data read into the caller's iovecs; `resid` tracks how
    // many bytes remain in the bufferlist (a short read may not fill every
    // iovec).
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}
9585
9586 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9587 {
9588 std::lock_guard lock(client_lock);
9589 tout(cct) << fd << std::endl;
9590 tout(cct) << offset << std::endl;
9591
9592 if (unmounting)
9593 return -ENOTCONN;
9594
9595 Fh *fh = get_filehandle(fd);
9596 if (!fh)
9597 return -EBADF;
9598 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9599 }
9600
// Core write path. Writes `size` bytes at `offset` into the file behind
// `f`; data comes from `buf`, or is gathered from `iov`/`iovcnt` when buf
// is NULL. A negative offset means "use (and advance) the fd position",
// honoring O_APPEND. Returns bytes written (== size on success) or -errno.
// Called with client_lock held; the lock is dropped while blocking on
// synchronous OSD writes and on inline-data migration.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;   // new fd position to install on success (0 = none)

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  // snapshots are read-only
  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // make sure we know whether this file carries inline data before writing
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // acquire write caps (may block); `have` tells us whether buffering is allowed
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the object cache entirely
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // write would exceed the inline limits: migrate the inline data out
      // to RADOS first; completion is awaited at `done`
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // small write applied directly to the inline data held in the inode
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    // O_DIRECT must not leave stale cached data overlapping this range
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // drop client_lock while blocking on the OSD round trip
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // install the advanced fd position computed above (pwrite-style callers
  // leave fpos == 0)
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // wait for the inline-data migration started above to finish
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    // -ECANCELED means someone else already uninlined; treat as success
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9833
9834 int Client::_flush(Fh *f)
9835 {
9836 Inode *in = f->inode.get();
9837 int err = f->take_async_err();
9838 if (err != 0) {
9839 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9840 << cpp_strerror(err) << dendl;
9841 } else {
9842 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9843 }
9844
9845 return err;
9846 }
9847
9848 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9849 {
9850 struct ceph_statx stx;
9851 stx.stx_size = length;
9852 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9853 }
9854
9855 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9856 {
9857 std::lock_guard lock(client_lock);
9858 tout(cct) << __func__ << std::endl;
9859 tout(cct) << fd << std::endl;
9860 tout(cct) << length << std::endl;
9861
9862 if (unmounting)
9863 return -ENOTCONN;
9864
9865 Fh *f = get_filehandle(fd);
9866 if (!f)
9867 return -EBADF;
9868 #if defined(__linux__) && defined(O_PATH)
9869 if (f->flags & O_PATH)
9870 return -EBADF;
9871 #endif
9872 struct stat attr;
9873 attr.st_size = length;
9874 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9875 }
9876
9877 int Client::fsync(int fd, bool syncdataonly)
9878 {
9879 std::lock_guard lock(client_lock);
9880 tout(cct) << "fsync" << std::endl;
9881 tout(cct) << fd << std::endl;
9882 tout(cct) << syncdataonly << std::endl;
9883
9884 if (unmounting)
9885 return -ENOTCONN;
9886
9887 Fh *f = get_filehandle(fd);
9888 if (!f)
9889 return -EBADF;
9890 #if defined(__linux__) && defined(O_PATH)
9891 if (f->flags & O_PATH)
9892 return -EBADF;
9893 #endif
9894 int r = _fsync(f, syncdataonly);
9895 if (r == 0) {
9896 // The IOs in this fsync were okay, but maybe something happened
9897 // in the background that we shoudl be reporting?
9898 r = f->take_async_err();
9899 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9900 << ") = 0, async_err = " << r << dendl;
9901 } else {
9902 // Assume that an error we encountered during fsync, even reported
9903 // synchronously, would also have applied the error to the Fh, and we
9904 // should clear it here to avoid returning the same error again on next
9905 // call.
9906 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9907 << r << dendl;
9908 f->take_async_err();
9909 }
9910 return r;
9911 }
9912
// Flush dirty data (and, unless syncdataonly, dirty caps/metadata and
// unsafe MDS requests) for `in` to stable storage. Returns 0 or the
// negative errno from the data writeback. Called with client_lock held;
// the lock is dropped while waiting on the object cacher flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // kick off writeback of cached dirty data; the completion is waited on
    // further below so cap flushing can proceed in parallel
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata (caps) to the MDS and remember the flush tid so
    // we can wait for its commit below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    // waiting on the most recent unsafe request covers all earlier ones
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // data is safe; now wait for the cap flush (if any) to be acked
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9979
9980 int Client::_fsync(Fh *f, bool syncdataonly)
9981 {
9982 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9983 return _fsync(f->inode.get(), syncdataonly);
9984 }
9985
9986 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9987 {
9988 std::lock_guard lock(client_lock);
9989 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9990 tout(cct) << fd << std::endl;
9991
9992 if (unmounting)
9993 return -ENOTCONN;
9994
9995 Fh *f = get_filehandle(fd);
9996 if (!f)
9997 return -EBADF;
9998 int r = _getattr(f->inode, mask, perms);
9999 if (r < 0)
10000 return r;
10001 fill_stat(f->inode, stbuf, NULL);
10002 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10003 return r;
10004 }
10005
10006 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10007 unsigned int want, unsigned int flags)
10008 {
10009 std::lock_guard lock(client_lock);
10010 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10011 tout(cct) << fd << std::endl;
10012
10013 if (unmounting)
10014 return -ENOTCONN;
10015
10016 Fh *f = get_filehandle(fd);
10017 if (!f)
10018 return -EBADF;
10019
10020 unsigned mask = statx_to_mask(flags, want);
10021
10022 int r = 0;
10023 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10024 r = _getattr(f->inode, mask, perms);
10025 if (r < 0) {
10026 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10027 return r;
10028 }
10029 }
10030
10031 fill_statx(f->inode, mask, stx);
10032 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10033 return r;
10034 }
10035
10036 // not written yet, but i want to link!
10037
10038 int Client::chdir(const char *relpath, std::string &new_cwd,
10039 const UserPerm& perms)
10040 {
10041 std::lock_guard lock(client_lock);
10042 tout(cct) << "chdir" << std::endl;
10043 tout(cct) << relpath << std::endl;
10044
10045 if (unmounting)
10046 return -ENOTCONN;
10047
10048 filepath path(relpath);
10049 InodeRef in;
10050 int r = path_walk(path, &in, perms);
10051 if (r < 0)
10052 return r;
10053
10054 if (!(in.get()->is_dir()))
10055 return -ENOTDIR;
10056
10057 if (cwd != in)
10058 cwd.swap(in);
10059 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10060
10061 _getcwd(new_cwd, perms);
10062 return 0;
10063 }
10064
// Reconstruct the absolute path of the current working directory by
// walking cached dentries from cwd up to root. If a parent linkage is not
// cached, issue a LOOKUPNAME request to the MDS and restart the walk from
// cwd. Returns without setting `dir` if cwd or an ancestor is unlinked.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    // prepend this component and step up to the parent directory
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10104
10105 void Client::getcwd(string& dir, const UserPerm& perms)
10106 {
10107 std::lock_guard l(client_lock);
10108 if (!unmounting)
10109 _getcwd(dir, perms);
10110 }
10111
10112 int Client::statfs(const char *path, struct statvfs *stbuf,
10113 const UserPerm& perms)
10114 {
10115 std::lock_guard l(client_lock);
10116 tout(cct) << __func__ << std::endl;
10117 unsigned long int total_files_on_fs;
10118
10119 if (unmounting)
10120 return -ENOTCONN;
10121
10122 ceph_statfs stats;
10123 C_SaferCond cond;
10124
10125 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10126 if (data_pools.size() == 1) {
10127 objecter->get_fs_stats(stats, data_pools[0], &cond);
10128 } else {
10129 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10130 }
10131
10132 client_lock.unlock();
10133 int rval = cond.wait();
10134 assert(root);
10135 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10136 client_lock.lock();
10137
10138 if (rval < 0) {
10139 ldout(cct, 1) << "underlying call to statfs returned error: "
10140 << cpp_strerror(rval)
10141 << dendl;
10142 return rval;
10143 }
10144
10145 memset(stbuf, 0, sizeof(*stbuf));
10146
10147 /*
10148 * we're going to set a block size of 4MB so we can represent larger
10149 * FSes without overflowing. Additionally convert the space
10150 * measurements from KB to bytes while making them in terms of
10151 * blocks. We use 4MB only because it is big enough, and because it
10152 * actually *is* the (ceph) default block size.
10153 */
10154 const int CEPH_BLOCK_SHIFT = 22;
10155 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10156 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10157 stbuf->f_files = total_files_on_fs;
10158 stbuf->f_ffree = 0;
10159 stbuf->f_favail = -1;
10160 stbuf->f_fsid = -1; // ??
10161 stbuf->f_flag = 0; // ??
10162 stbuf->f_namemax = NAME_MAX;
10163
10164 // Usually quota_root will == root_ancestor, but if the mount root has no
10165 // quota but we can see a parent of it that does have a quota, we'll
10166 // respect that one instead.
10167 ceph_assert(root != nullptr);
10168 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10169
10170 // get_quota_root should always give us something
10171 // because client quotas are always enabled
10172 ceph_assert(quota_root != nullptr);
10173
10174 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10175
10176 // Skip the getattr if any sessions are stale, as we don't want to
10177 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10178 // is unhealthy.
10179 if (!_any_stale_sessions()) {
10180 int r = _getattr(quota_root, 0, perms, true);
10181 if (r != 0) {
10182 // Ignore return value: error getting latest inode metadata is not a good
10183 // reason to break "df".
10184 lderr(cct) << "Error in getattr on quota root 0x"
10185 << std::hex << quota_root->ino << std::dec
10186 << " statfs result may be outdated" << dendl;
10187 }
10188 }
10189
10190 // Special case: if there is a size quota set on the Inode acting
10191 // as the root for this client mount, then report the quota status
10192 // as the filesystem statistics.
10193 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10194 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10195 // It is possible for a quota to be exceeded: arithmetic here must
10196 // handle case where used > total.
10197 const fsblkcnt_t free = total > used ? total - used : 0;
10198
10199 stbuf->f_blocks = total;
10200 stbuf->f_bfree = free;
10201 stbuf->f_bavail = free;
10202 } else {
10203 // General case: report the cluster statistics returned from RADOS. Because
10204 // multiple pools may be used without one filesystem namespace via
10205 // layouts, this is the most correct thing we can do.
10206 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10207 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10208 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10209 }
10210
10211 return rval;
10212 }
10213
// Perform a file-lock operation (get/set; fcntl- or flock-style) by sending
// the corresponding request to the MDS, then mirror the outcome into the
// client-side lock state on success.
//   lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//   op:        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
//   sleep:     non-zero to block until the lock can be granted
//   removing:  true when called from _release_filelocks(); skips updating
//              the per-Fh lock state since it is being torn down
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  if (in->flags & I_ERROR_FILELOCK)
    return -EIO;

  // translate the POSIX lock type into the MDS lock command
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK that actually acquires a lock may sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting lock (if any) returned by the MDS into *fl
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the granted/removed lock into the inode's lock state,
      // lazily creating the state object on first use
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	// also track the lock on the Fh so it can be released on close
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10327
// Interrupt a blocked file-lock request: mark it aborted (so it will not
// be re-sent) and, if it already reached an MDS, send a matching *_INTR
// unlock so the MDS cancels the pending lock attempt.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // pick the interrupt rule that matches the original lock style
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // build an unlock request mirroring the original lock's parameters
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // issue on behalf of the original requester's credentials
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10360
10361 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10362 {
10363 if (!in->fcntl_locks && !in->flock_locks)
10364 return;
10365
10366 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10367 encode(nr_fcntl_locks, bl);
10368 if (nr_fcntl_locks) {
10369 auto &lock_state = in->fcntl_locks;
10370 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10371 p != lock_state->held_locks.end();
10372 ++p)
10373 encode(p->second, bl);
10374 }
10375
10376 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10377 encode(nr_flock_locks, bl);
10378 if (nr_flock_locks) {
10379 auto &lock_state = in->flock_locks;
10380 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10381 p != lock_state->held_locks.end();
10382 ++p)
10383 encode(p->second, bl);
10384 }
10385
10386 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10387 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10388 }
10389
// Release every file lock recorded against a file handle (called on
// close). When the inode is in the I_ERROR_FILELOCK state the MDS-side
// state is already gone, so locks are only dropped locally; otherwise an
// explicit unlock is sent to the MDS for each held lock.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  // locks that still need an MDS unlock, tagged with their lock style
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;  // advance before remove_lock() can invalidate q
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;  // advance before remove_lock() can invalidate q
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // once the last lock is gone the inode may leave the error state
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  // send an unlock to the MDS for each lock we were still holding
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
10448
10449 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10450 ceph_lock_state_t *lock_state)
10451 {
10452 int lock_cmd;
10453 if (F_RDLCK == fl->l_type)
10454 lock_cmd = CEPH_LOCK_SHARED;
10455 else if (F_WRLCK == fl->l_type)
10456 lock_cmd = CEPH_LOCK_EXCL;
10457 else
10458 lock_cmd = CEPH_LOCK_UNLOCK;;
10459
10460 ceph_filelock filelock;
10461 filelock.start = fl->l_start;
10462 filelock.length = fl->l_len;
10463 filelock.client = 0;
10464 // see comment in _do_filelock()
10465 filelock.owner = owner | (1ULL << 63);
10466 filelock.pid = fl->l_pid;
10467 filelock.type = lock_cmd;
10468
10469 if (filelock.type == CEPH_LOCK_UNLOCK) {
10470 list<ceph_filelock> activated_locks;
10471 lock_state->remove_lock(filelock, activated_locks);
10472 } else {
10473 bool r = lock_state->add_lock(filelock, false, false, NULL);
10474 ceph_assert(r);
10475 }
10476 }
10477
10478 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10479 {
10480 Inode *in = fh->inode.get();
10481 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10482 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10483 return ret;
10484 }
10485
10486 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10487 {
10488 Inode *in = fh->inode.get();
10489 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10490 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10491 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10492 return ret;
10493 }
10494
10495 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10496 {
10497 Inode *in = fh->inode.get();
10498 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10499
10500 int sleep = !(cmd & LOCK_NB);
10501 cmd &= ~LOCK_NB;
10502
10503 int type;
10504 switch (cmd) {
10505 case LOCK_SH:
10506 type = F_RDLCK;
10507 break;
10508 case LOCK_EX:
10509 type = F_WRLCK;
10510 break;
10511 case LOCK_UN:
10512 type = F_UNLCK;
10513 break;
10514 default:
10515 return -EINVAL;
10516 }
10517
10518 struct flock fl;
10519 memset(&fl, 0, sizeof(fl));
10520 fl.l_type = type;
10521 fl.l_whence = SEEK_SET;
10522
10523 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10524 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10525 return ret;
10526 }
10527
10528 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10529 {
10530 /* Since the only thing this does is wrap a call to statfs, and
10531 statfs takes a lock, it doesn't seem we have a need to split it
10532 out. */
10533 return statfs(0, stbuf, perms);
10534 }
10535
// Register client callbacks supplied by the libcephfs/FUSE layer and start
// the finisher thread that delivers each one. Callbacks not provided in
// `args` are left unset. A null `args` pointer is a no-op.
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  // umask_cb is invoked directly, so no finisher thread is started for it
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
10571
10572 int Client::test_dentry_handling(bool can_invalidate)
10573 {
10574 int r = 0;
10575
10576 can_invalidate_dentries = can_invalidate;
10577
10578 if (can_invalidate_dentries) {
10579 ceph_assert(dentry_invalidate_cb);
10580 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10581 r = 0;
10582 } else {
10583 ceph_assert(remount_cb);
10584 ldout(cct, 1) << "using remount_cb" << dendl;
10585 r = _do_remount(false);
10586 }
10587
10588 return r;
10589 }
10590
// Flush all dirty client state (buffered file data, dirty caps, unsafe
// MDS requests) out to the cluster.  Called with client_lock held;
// temporarily drops it while waiting for the object cacher flush so the
// completion can make progress.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data: only needed when the object cacher is enabled
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // snapshot the tid now so we only wait for flushes issued up to here
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocked on the data flush; other client
    // activity (including the flush completion) needs the lock
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
10621
10622 int Client::sync_fs()
10623 {
10624 std::lock_guard l(client_lock);
10625
10626 if (unmounting)
10627 return -ENOTCONN;
10628
10629 return _sync_fs();
10630 }
10631
10632 int64_t Client::drop_caches()
10633 {
10634 std::lock_guard l(client_lock);
10635 return objectcacher->release_all();
10636 }
10637
10638 int Client::_lazyio(Fh *fh, int enable)
10639 {
10640 Inode *in = fh->inode.get();
10641 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10642
10643 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10644 return 0;
10645
10646 int orig_mode = fh->mode;
10647 if (enable) {
10648 fh->mode |= CEPH_FILE_MODE_LAZY;
10649 in->get_open_ref(fh->mode);
10650 in->put_open_ref(orig_mode);
10651 check_caps(in, CHECK_CAPS_NODELAY);
10652 } else {
10653 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10654 in->get_open_ref(fh->mode);
10655 in->put_open_ref(orig_mode);
10656 check_caps(in, 0);
10657 }
10658
10659 return 0;
10660 }
10661
10662 int Client::lazyio(int fd, int enable)
10663 {
10664 std::lock_guard l(client_lock);
10665 Fh *f = get_filehandle(fd);
10666 if (!f)
10667 return -EBADF;
10668
10669 return _lazyio(f, enable);
10670 }
10671
10672 int Client::ll_lazyio(Fh *fh, int enable)
10673 {
10674 std::lock_guard lock(client_lock);
10675 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10676 tout(cct) << __func__ << std::endl;
10677
10678 return _lazyio(fh, enable);
10679 }
10680
10681 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10682 {
10683 std::lock_guard l(client_lock);
10684 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10685 << ", " << offset << ", " << count << ")" << dendl;
10686
10687 Fh *f = get_filehandle(fd);
10688 if (!f)
10689 return -EBADF;
10690
10691 // for now
10692 _fsync(f, true);
10693
10694 return 0;
10695 }
10696
10697 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10698 {
10699 std::lock_guard l(client_lock);
10700 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10701 << ", " << offset << ", " << count << ")" << dendl;
10702
10703 Fh *f = get_filehandle(fd);
10704 if (!f)
10705 return -EBADF;
10706 Inode *in = f->inode.get();
10707
10708 _fsync(f, true);
10709 if (_release(in)) {
10710 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10711 if (r < 0)
10712 return r;
10713 }
10714 return 0;
10715 }
10716
10717
10718 // =============================
10719 // snaps
10720
10721 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10722 {
10723 std::lock_guard l(client_lock);
10724
10725 if (unmounting)
10726 return -ENOTCONN;
10727
10728 filepath path(relpath);
10729 InodeRef in;
10730 int r = path_walk(path, &in, perm);
10731 if (r < 0)
10732 return r;
10733 if (cct->_conf->client_permissions) {
10734 r = may_create(in.get(), perm);
10735 if (r < 0)
10736 return r;
10737 }
10738 Inode *snapdir = open_snapdir(in.get());
10739 return _mkdir(snapdir, name, 0, perm);
10740 }
10741
10742 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10743 {
10744 std::lock_guard l(client_lock);
10745
10746 if (unmounting)
10747 return -ENOTCONN;
10748
10749 filepath path(relpath);
10750 InodeRef in;
10751 int r = path_walk(path, &in, perms);
10752 if (r < 0)
10753 return r;
10754 if (cct->_conf->client_permissions) {
10755 r = may_delete(in.get(), NULL, perms);
10756 if (r < 0)
10757 return r;
10758 }
10759 Inode *snapdir = open_snapdir(in.get());
10760 return _rmdir(snapdir, name, perms);
10761 }
10762
10763 // =============================
10764 // expose caps
10765
10766 int Client::get_caps_issued(int fd) {
10767
10768 std::lock_guard lock(client_lock);
10769
10770 if (unmounting)
10771 return -ENOTCONN;
10772
10773 Fh *f = get_filehandle(fd);
10774 if (!f)
10775 return -EBADF;
10776
10777 return f->inode->caps_issued();
10778 }
10779
10780 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10781 {
10782 std::lock_guard lock(client_lock);
10783
10784 if (unmounting)
10785 return -ENOTCONN;
10786
10787 filepath p(path);
10788 InodeRef in;
10789 int r = path_walk(p, &in, perms, true);
10790 if (r < 0)
10791 return r;
10792 return in->caps_issued();
10793 }
10794
10795 // =========================================
10796 // low level
10797
// Return the special ".snap" directory inode for 'diri', creating an
// in-memory Inode (snapid CEPH_SNAPDIR) on first access.  The snapdir
// mirrors the parent's ownership/mode/timestamps and is cached in
// inode_map, so subsequent calls return the same object.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // same ino as the parent, distinguished only by the magic snapid
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->atime = diri->atime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // flag the parent so it knows a snapdir inode refers back to it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10831
// Low-level lookup of 'name' within 'parent'.  On success fills *attr,
// takes an ll reference on the resulting inode (caller must forget it)
// and returns it via *out; on failure attr->st_ino is zeroed and *out
// gets the NULL InodeRef.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
                      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are always permitted; anything else needs search perm
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
        return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get()); // hand the caller one ll reference

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
          << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10873
// Look up an inode by bare inode number and return it (with an ll
// reference) via *inode.  If the inode is not yet linked into the dentry
// cache, its parent and name are additionally fetched from the MDS so a
// path can later be reconstructed.  On any intermediate failure the
// references taken so far are dropped before returning.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // drop the reference taken in Num1 before bailing out
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // the parent was only needed to resolve the name; release it
  _ll_forget(parent, 1);
  return 0;
}
10925
10926 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10927 struct ceph_statx *stx, unsigned want, unsigned flags,
10928 const UserPerm& perms)
10929 {
10930 std::lock_guard lock(client_lock);
10931 vinodeno_t vparent = _get_vino(parent);
10932 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10933 tout(cct) << "ll_lookupx" << std::endl;
10934 tout(cct) << name << std::endl;
10935
10936 if (unmounting)
10937 return -ENOTCONN;
10938
10939 int r = 0;
10940 if (!fuse_default_permissions) {
10941 r = may_lookup(parent, perms);
10942 if (r < 0)
10943 return r;
10944 }
10945
10946 string dname(name);
10947 InodeRef in;
10948
10949 unsigned mask = statx_to_mask(flags, want);
10950 r = _lookup(parent, dname, mask, &in, perms);
10951 if (r < 0) {
10952 stx->stx_ino = 0;
10953 stx->stx_mask = 0;
10954 } else {
10955 ceph_assert(in);
10956 fill_statx(in, mask, stx);
10957 _ll_get(in.get());
10958 }
10959
10960 ldout(cct, 3) << __func__ << " " << vparent << " " << name
10961 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10962 tout(cct) << stx->stx_ino << std::endl;
10963 *out = in.get();
10964 return r;
10965 }
10966
10967 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10968 unsigned int want, unsigned int flags, const UserPerm& perms)
10969 {
10970 std::lock_guard lock(client_lock);
10971
10972 if (unmounting)
10973 return -ENOTCONN;
10974
10975 filepath fp(name, 0);
10976 InodeRef in;
10977 int rc;
10978 unsigned mask = statx_to_mask(flags, want);
10979
10980 ldout(cct, 3) << __func__ << " " << name << dendl;
10981 tout(cct) << __func__ << std::endl;
10982 tout(cct) << name << std::endl;
10983
10984 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10985 if (rc < 0) {
10986 /* zero out mask, just in case... */
10987 stx->stx_mask = 0;
10988 stx->stx_ino = 0;
10989 *out = NULL;
10990 return rc;
10991 } else {
10992 ceph_assert(in);
10993 fill_statx(in, mask, stx);
10994 _ll_get(in.get());
10995 *out = in.get();
10996 return 0;
10997 }
10998 }
10999
// Take one low-level (libcephfs/FUSE) reference on 'in'.  The first ll
// reference also pins the Inode itself, pins its parent dentry (for
// directories) and bumps the per-snapshot refcount; _ll_put() undoes
// these when the count returns to zero.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11014
// Drop 'num' low-level references from 'in'.  When the last ll reference
// goes away, the parent-dentry pin and per-snapshot refcount taken by
// _ll_get() are released and the inode itself is unpinned.
// Returns the remaining ll_ref count (0 when fully released).
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      // balance the ll_snap_ref increment made by the first _ll_get()
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
        ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11037
// Release every outstanding low-level reference (used at unmount time).
// Each pinned inode is first copied into 'to_be_put' so it stays alive
// until the set is destroyed on return; 'next' is captured before the
// _ll_put() call because dropping the last reference may erase the
// current entry from inode_map, invalidating 'it'.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11055
// Drop 'count' ll references from 'in' (FUSE "forget" semantics).
// Returns true when this was the last reference, or when the forget is
// ignored (unmounting, or the root inode).  A count larger than the
// current ll_ref is clamped with a warning rather than underflowing.
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // caller asked us to drop more refs than we hold; clamp to what we have
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
                  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11084
11085 bool Client::ll_forget(Inode *in, uint64_t count)
11086 {
11087 std::lock_guard lock(client_lock);
11088 return _ll_forget(in, count);
11089 }
11090
11091 bool Client::ll_put(Inode *in)
11092 {
11093 /* ll_forget already takes the lock */
11094 return ll_forget(in, 1);
11095 }
11096
11097 int Client::ll_get_snap_ref(snapid_t snap)
11098 {
11099 std::lock_guard lock(client_lock);
11100 auto p = ll_snap_ref.find(snap);
11101 if (p != ll_snap_ref.end())
11102 return p->second;
11103 return 0;
11104 }
11105
11106 snapid_t Client::ll_get_snapid(Inode *in)
11107 {
11108 std::lock_guard lock(client_lock);
11109 return in->snapid;
11110 }
11111
11112 Inode *Client::ll_get_inode(ino_t ino)
11113 {
11114 std::lock_guard lock(client_lock);
11115
11116 if (unmounting)
11117 return NULL;
11118
11119 vinodeno_t vino = _map_faked_ino(ino);
11120 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11121 if (p == inode_map.end())
11122 return NULL;
11123 Inode *in = p->second;
11124 _ll_get(in);
11125 return in;
11126 }
11127
11128 Inode *Client::ll_get_inode(vinodeno_t vino)
11129 {
11130 std::lock_guard lock(client_lock);
11131
11132 if (unmounting)
11133 return NULL;
11134
11135 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11136 if (p == inode_map.end())
11137 return NULL;
11138 Inode *in = p->second;
11139 _ll_get(in);
11140 return in;
11141 }
11142
11143 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11144 {
11145 vinodeno_t vino = _get_vino(in);
11146
11147 ldout(cct, 8) << __func__ << " " << vino << dendl;
11148 tout(cct) << __func__ << std::endl;
11149 tout(cct) << vino.ino.val << std::endl;
11150
11151 if (vino.snapid < CEPH_NOSNAP)
11152 return 0;
11153 else
11154 return _getattr(in, caps, perms);
11155 }
11156
11157 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11158 {
11159 std::lock_guard lock(client_lock);
11160
11161 if (unmounting)
11162 return -ENOTCONN;
11163
11164 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11165
11166 if (res == 0)
11167 fill_stat(in, attr);
11168 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11169 return res;
11170 }
11171
11172 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11173 unsigned int flags, const UserPerm& perms)
11174 {
11175 std::lock_guard lock(client_lock);
11176
11177 if (unmounting)
11178 return -ENOTCONN;
11179
11180 int res = 0;
11181 unsigned mask = statx_to_mask(flags, want);
11182
11183 if (mask && !in->caps_issued_mask(mask, true))
11184 res = _ll_getattr(in, mask, perms);
11185
11186 if (res == 0)
11187 fill_statx(in, mask, stx);
11188 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11189 return res;
11190 }
11191
// Common implementation for ll_setattr/ll_setattrx: trace the request,
// enforce permissions (unless FUSE already did), strip the client-side
// "set to current time" flags, and forward to __setattrx().  *inp
// receives the (possibly replaced) inode reference.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                         const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
                << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW flags are resolved locally; don't forward them to the MDS
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11220
// Low-level setattr (statx form).  'target' pins the inode across the
// call; on success the caller's stx is refreshed from the updated inode.
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    // setattr on this inode must not have redirected us elsewhere
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11239
// Low-level setattr (struct stat form): convert to statx and share the
// statx implementation; on success 'attr' is refreshed from the inode.
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
                       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    // setattr on this inode must not have redirected us elsewhere
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
11261
11262
11263 // ----------
11264 // xattrs
11265
11266 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11267 const UserPerm& perms)
11268 {
11269 std::lock_guard lock(client_lock);
11270
11271 if (unmounting)
11272 return -ENOTCONN;
11273
11274 InodeRef in;
11275 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11276 if (r < 0)
11277 return r;
11278 return _getxattr(in, name, value, size, perms);
11279 }
11280
11281 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11282 const UserPerm& perms)
11283 {
11284 std::lock_guard lock(client_lock);
11285
11286 if (unmounting)
11287 return -ENOTCONN;
11288
11289 InodeRef in;
11290 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11291 if (r < 0)
11292 return r;
11293 return _getxattr(in, name, value, size, perms);
11294 }
11295
11296 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11297 const UserPerm& perms)
11298 {
11299 std::lock_guard lock(client_lock);
11300
11301 if (unmounting)
11302 return -ENOTCONN;
11303
11304 Fh *f = get_filehandle(fd);
11305 if (!f)
11306 return -EBADF;
11307 return _getxattr(f->inode, name, value, size, perms);
11308 }
11309
11310 int Client::listxattr(const char *path, char *list, size_t size,
11311 const UserPerm& perms)
11312 {
11313 std::lock_guard lock(client_lock);
11314
11315 if (unmounting)
11316 return -ENOTCONN;
11317
11318 InodeRef in;
11319 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11320 if (r < 0)
11321 return r;
11322 return Client::_listxattr(in.get(), list, size, perms);
11323 }
11324
11325 int Client::llistxattr(const char *path, char *list, size_t size,
11326 const UserPerm& perms)
11327 {
11328 std::lock_guard lock(client_lock);
11329
11330 if (unmounting)
11331 return -ENOTCONN;
11332
11333 InodeRef in;
11334 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11335 if (r < 0)
11336 return r;
11337 return Client::_listxattr(in.get(), list, size, perms);
11338 }
11339
11340 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11341 {
11342 std::lock_guard lock(client_lock);
11343
11344 if (unmounting)
11345 return -ENOTCONN;
11346
11347 Fh *f = get_filehandle(fd);
11348 if (!f)
11349 return -EBADF;
11350 return Client::_listxattr(f->inode.get(), list, size, perms);
11351 }
11352
11353 int Client::removexattr(const char *path, const char *name,
11354 const UserPerm& perms)
11355 {
11356 std::lock_guard lock(client_lock);
11357
11358 if (unmounting)
11359 return -ENOTCONN;
11360
11361 InodeRef in;
11362 int r = Client::path_walk(path, &in, perms, true);
11363 if (r < 0)
11364 return r;
11365 return _removexattr(in, name, perms);
11366 }
11367
11368 int Client::lremovexattr(const char *path, const char *name,
11369 const UserPerm& perms)
11370 {
11371 std::lock_guard lock(client_lock);
11372
11373 if (unmounting)
11374 return -ENOTCONN;
11375
11376 InodeRef in;
11377 int r = Client::path_walk(path, &in, perms, false);
11378 if (r < 0)
11379 return r;
11380 return _removexattr(in, name, perms);
11381 }
11382
11383 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11384 {
11385 std::lock_guard lock(client_lock);
11386
11387 if (unmounting)
11388 return -ENOTCONN;
11389
11390 Fh *f = get_filehandle(fd);
11391 if (!f)
11392 return -EBADF;
11393 return _removexattr(f->inode, name, perms);
11394 }
11395
11396 int Client::setxattr(const char *path, const char *name, const void *value,
11397 size_t size, int flags, const UserPerm& perms)
11398 {
11399 _setxattr_maybe_wait_for_osdmap(name, value, size);
11400
11401 std::lock_guard lock(client_lock);
11402
11403 if (unmounting)
11404 return -ENOTCONN;
11405
11406 InodeRef in;
11407 int r = Client::path_walk(path, &in, perms, true);
11408 if (r < 0)
11409 return r;
11410 return _setxattr(in, name, value, size, flags, perms);
11411 }
11412
11413 int Client::lsetxattr(const char *path, const char *name, const void *value,
11414 size_t size, int flags, const UserPerm& perms)
11415 {
11416 _setxattr_maybe_wait_for_osdmap(name, value, size);
11417
11418 std::lock_guard lock(client_lock);
11419
11420 if (unmounting)
11421 return -ENOTCONN;
11422
11423 InodeRef in;
11424 int r = Client::path_walk(path, &in, perms, false);
11425 if (r < 0)
11426 return r;
11427 return _setxattr(in, name, value, size, flags, perms);
11428 }
11429
11430 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11431 int flags, const UserPerm& perms)
11432 {
11433 _setxattr_maybe_wait_for_osdmap(name, value, size);
11434
11435 std::lock_guard lock(client_lock);
11436
11437 if (unmounting)
11438 return -ENOTCONN;
11439
11440 Fh *f = get_filehandle(fd);
11441 if (!f)
11442 return -EBADF;
11443 return _setxattr(f->inode, name, value, size, flags, perms);
11444 }
11445
11446 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11447 const UserPerm& perms)
11448 {
11449 int r;
11450
11451 const VXattr *vxattr = _match_vxattr(in, name);
11452 if (vxattr) {
11453 r = -ENODATA;
11454
11455 // Do a force getattr to get the latest quota before returning
11456 // a value to userspace.
11457 int flags = 0;
11458 if (vxattr->flags & VXATTR_RSTAT) {
11459 flags |= CEPH_STAT_RSTAT;
11460 }
11461 r = _getattr(in, flags, perms, true);
11462 if (r != 0) {
11463 // Error from getattr!
11464 return r;
11465 }
11466
11467 // call pointer-to-member function
11468 char buf[256];
11469 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11470 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11471 } else {
11472 r = -ENODATA;
11473 }
11474
11475 if (size != 0) {
11476 if (r > (int)size) {
11477 r = -ERANGE;
11478 } else if (r > 0) {
11479 memcpy(value, buf, r);
11480 }
11481 }
11482 goto out;
11483 }
11484
11485 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11486 r = -EOPNOTSUPP;
11487 goto out;
11488 }
11489
11490 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11491 if (r == 0) {
11492 string n(name);
11493 r = -ENODATA;
11494 if (in->xattrs.count(n)) {
11495 r = in->xattrs[n].length();
11496 if (r > 0 && size != 0) {
11497 if (size >= (unsigned)r)
11498 memcpy(value, in->xattrs[n].c_str(), r);
11499 else
11500 r = -ERANGE;
11501 }
11502 }
11503 }
11504 out:
11505 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11506 return r;
11507 }
11508
11509 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11510 const UserPerm& perms)
11511 {
11512 if (cct->_conf->client_permissions) {
11513 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11514 if (r < 0)
11515 return r;
11516 }
11517 return _getxattr(in.get(), name, value, size, perms);
11518 }
11519
11520 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11521 size_t size, const UserPerm& perms)
11522 {
11523 std::lock_guard lock(client_lock);
11524
11525 if (unmounting)
11526 return -ENOTCONN;
11527
11528 vinodeno_t vino = _get_vino(in);
11529
11530 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11531 tout(cct) << __func__ << std::endl;
11532 tout(cct) << vino.ino.val << std::endl;
11533 tout(cct) << name << std::endl;
11534
11535 if (!fuse_default_permissions) {
11536 int r = xattr_permission(in, name, MAY_READ, perms);
11537 if (r < 0)
11538 return r;
11539 }
11540
11541 return _getxattr(in, name, value, size, perms);
11542 }
11543
// Copy the NUL-terminated list of xattr names into 'name' following
// listxattr(2) semantics.  size == 0 means "just report the required
// buffer length".  Returns total bytes needed/written, -ERANGE when the
// provided buffer is too small, or a negative error from the refresh.
int Client::_listxattr(Inode *in, char *name, size_t size,
                       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // make sure our cached xattr map is populated before walking it
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1; // name plus trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11573
// Low-level listxattr on an Inode*: trace the call, then delegate the
// actual buffer-filling to _listxattr().
int Client::ll_listxattr(Inode *in, char *names, size_t size,
                         const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11591
// Send a CEPH_MDS_OP_SETXATTR request for 'in' to the MDS.
// value == NULL encodes removal; XATTR_CREATE/XATTR_REPLACE map onto the
// corresponding CEPH_XATTR_* request flags.  The value bytes travel as
// the request's data payload.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
                         size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // attach the value as the request payload
  bufferlist bl;
  assert (value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11624
// Set (or, after normalization, remove) an extended attribute on 'in'.
// Handles POSIX ACL xattrs specially: validates them, and when an access
// ACL is equivalent to plain mode bits, applies a chmod instead of
// storing the xattr.  Unknown namespaces and read-only virtual xattrs
// are rejected; setting a quota additionally verifies the MDS created a
// snaprealm for the inode.
int Client::_setxattr(Inode *in, const char *name, const void *value,
                      size_t size, int flags, const UserPerm& perms)
{
  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // normalize: empty value is legal, NULL with nonzero size is not
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // only user./security./trusted./ceph. namespaces (plus ACLs) are valid
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
        int ret = posix_acl_equiv_mode(value, size, &new_mode);
        if (ret < 0)
          return ret;
        if (ret == 0) {
          // ACL is fully expressible as mode bits; drop the xattr itself
          value = NULL;
          size = 0;
        }
        if (new_mode != in->mode) {
          struct ceph_statx stx;
          stx.stx_mode = new_mode;
          ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
          if (ret < 0)
            return ret;
        }
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
        // default ACLs only make sense on directories
        if (!S_ISDIR(in->mode))
          return -EACCES;
        int ret = posix_acl_check(value, size);
        if (ret < 0)
          return -EINVAL;
        if (ret == 0) {
          value = NULL;
          size = 0;
        }
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
        return -EOPNOTSUPP;
      // quota changes require the MDS to set up a snaprealm; verify below
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
        check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
        !(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11705
11706 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11707 size_t size, int flags, const UserPerm& perms)
11708 {
11709 if (cct->_conf->client_permissions) {
11710 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11711 if (r < 0)
11712 return r;
11713 }
11714 return _setxattr(in.get(), name, value, size, flags, perms);
11715 }
11716
11717 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11718 {
11719 string tmp;
11720 if (name == "layout") {
11721 string::iterator begin = value.begin();
11722 string::iterator end = value.end();
11723 keys_and_values<string::iterator> p; // create instance of parser
11724 std::map<string, string> m; // map to receive results
11725 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11726 return -EINVAL;
11727 }
11728 if (begin != end)
11729 return -EINVAL;
11730 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11731 if (q->first == "pool") {
11732 tmp = q->second;
11733 break;
11734 }
11735 }
11736 } else if (name == "layout.pool") {
11737 tmp = value;
11738 }
11739
11740 if (tmp.length()) {
11741 int64_t pool;
11742 try {
11743 pool = boost::lexical_cast<unsigned>(tmp);
11744 if (!osdmap->have_pg_pool(pool))
11745 return -ENOENT;
11746 } catch (boost::bad_lexical_cast const&) {
11747 pool = osdmap->lookup_pg_pool_name(tmp);
11748 if (pool < 0) {
11749 return -ENOENT;
11750 }
11751 }
11752 }
11753
11754 return 0;
11755 }
11756
11757 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11758 {
11759 // For setting pool of layout, MetaRequest need osdmap epoch.
11760 // There is a race which create a new data pool but client and mds both don't have.
11761 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11762 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11763 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11764 string rest(strstr(name, "layout"));
11765 string v((const char*)value, size);
11766 int r = objecter->with_osdmap([&](const OSDMap& o) {
11767 return _setxattr_check_data_pool(rest, v, &o);
11768 });
11769
11770 if (r == -ENOENT) {
11771 C_SaferCond ctx;
11772 objecter->wait_for_latest_osdmap(&ctx);
11773 ctx.wait();
11774 }
11775 }
11776 }
11777
11778 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11779 size_t size, int flags, const UserPerm& perms)
11780 {
11781 _setxattr_maybe_wait_for_osdmap(name, value, size);
11782
11783 std::lock_guard lock(client_lock);
11784
11785 if (unmounting)
11786 return -ENOTCONN;
11787
11788 vinodeno_t vino = _get_vino(in);
11789
11790 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11791 tout(cct) << __func__ << std::endl;
11792 tout(cct) << vino.ino.val << std::endl;
11793 tout(cct) << name << std::endl;
11794
11795 if (!fuse_default_permissions) {
11796 int r = xattr_permission(in, name, MAY_WRITE, perms);
11797 if (r < 0)
11798 return r;
11799 }
11800 return _setxattr(in, name, value, size, flags, perms);
11801 }
11802
11803 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11804 {
11805 if (in->snapid != CEPH_NOSNAP) {
11806 return -EROFS;
11807 }
11808
11809 // same xattrs supported by kernel client
11810 if (strncmp(name, "user.", 5) &&
11811 strncmp(name, "system.", 7) &&
11812 strncmp(name, "security.", 9) &&
11813 strncmp(name, "trusted.", 8) &&
11814 strncmp(name, "ceph.", 5))
11815 return -EOPNOTSUPP;
11816
11817 const VXattr *vxattr = _match_vxattr(in, name);
11818 if (vxattr && vxattr->readonly)
11819 return -EOPNOTSUPP;
11820
11821 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11822 filepath path;
11823 in->make_nosnap_relative_path(path);
11824 req->set_filepath(path);
11825 req->set_filepath2(name);
11826 req->set_inode(in);
11827
11828 int res = make_request(req, perms);
11829
11830 trim_cache();
11831 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11832 return res;
11833 }
11834
11835 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11836 {
11837 if (cct->_conf->client_permissions) {
11838 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11839 if (r < 0)
11840 return r;
11841 }
11842 return _removexattr(in.get(), name, perms);
11843 }
11844
11845 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11846 {
11847 std::lock_guard lock(client_lock);
11848
11849 if (unmounting)
11850 return -ENOTCONN;
11851
11852 vinodeno_t vino = _get_vino(in);
11853
11854 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11855 tout(cct) << "ll_removexattr" << std::endl;
11856 tout(cct) << vino.ino.val << std::endl;
11857 tout(cct) << name << std::endl;
11858
11859 if (!fuse_default_permissions) {
11860 int r = xattr_permission(in, name, MAY_WRITE, perms);
11861 if (r < 0)
11862 return r;
11863 }
11864
11865 return _removexattr(in, name, perms);
11866 }
11867
11868 bool Client::_vxattrcb_quota_exists(Inode *in)
11869 {
11870 return in->quota.is_enable() &&
11871 (in->snapid != CEPH_NOSNAP ||
11872 (in->snaprealm && in->snaprealm->ino == in->ino));
11873 }
11874 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11875 {
11876 return snprintf(val, size,
11877 "max_bytes=%lld max_files=%lld",
11878 (long long int)in->quota.max_bytes,
11879 (long long int)in->quota.max_files);
11880 }
11881 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11882 {
11883 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11884 }
11885 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11886 {
11887 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11888 }
11889
11890 bool Client::_vxattrcb_layout_exists(Inode *in)
11891 {
11892 return in->layout != file_layout_t();
11893 }
// Render the full file layout as
// "stripe_unit=U stripe_count=C object_size=S pool=P[ pool_namespace=NS]".
// The pool is printed by name when our current osdmap knows it, otherwise
// by numeric id.
// NOTE(review): the snprintf return values are accumulated without a
// truncation check, so a too-small buffer would make val + r step past
// the end — presumably callers pre-size the buffer; confirm.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Namespace is appended only when non-empty.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
11914 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11915 {
11916 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
11917 }
11918 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11919 {
11920 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
11921 }
11922 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11923 {
11924 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
11925 }
11926 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11927 {
11928 size_t r;
11929 objecter->with_osdmap([&](const OSDMap& o) {
11930 if (o.have_pg_pool(in->layout.pool_id))
11931 r = snprintf(val, size, "%s", o.get_pool_name(
11932 in->layout.pool_id).c_str());
11933 else
11934 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11935 });
11936 return r;
11937 }
11938 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11939 {
11940 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11941 }
11942 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11943 {
11944 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11945 }
11946 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11947 {
11948 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
11949 }
11950 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11951 {
11952 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
11953 }
11954 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11955 {
11956 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11957 }
11958 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11959 {
11960 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
11961 }
11962 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11963 {
11964 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
11965 }
11966 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11967 {
11968 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
11969 }
11970 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11971 {
11972 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11973 (long)in->rstat.rctime.nsec());
11974 }
11975 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
11976 {
11977 return in->dir_pin != -ENODATA;
11978 }
11979 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
11980 {
11981 return snprintf(val, size, "%ld", (long)in->dir_pin);
11982 }
11983
11984 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
11985 {
11986 return !in->snap_btime.is_zero();
11987 }
11988
11989 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
11990 {
11991 return snprintf(val, size, "%llu.%09lu",
11992 (long long unsigned)in->snap_btime.sec(),
11993 (long unsigned)in->snap_btime.nsec());
11994 }
11995
// Helpers for building VXattr table entries (GNU designated-initializer
// syntax, matching the struct field order: name, getxattr_cb, readonly,
// exists_cb, flags).
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only vxattr with no existence callback (always present).
#define XATTR_NAME_CEPH(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: 0,							\
}
// Same as XATTR_NAME_CEPH, but with explicit flags (e.g. VXATTR_RSTAT).
#define XATTR_NAME_CEPH2(_type, _name, _flags)			\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: true,						\
  exists_cb: NULL,						\
  flags: _flags,						\
}
// Writable layout sub-field; only "exists" when a non-default layout is set.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)		\
{								\
  name: CEPH_XATTR_NAME2(_type, _name, _field),			\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_layout_exists,			\
  flags: 0,							\
}
// Writable quota sub-field; only "exists" when a quota is configured.
#define XATTR_QUOTA_FIELD(_type, _name)				\
{								\
  name: CEPH_XATTR_NAME(_type, _name),				\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,	\
  readonly: false,						\
  exists_cb: &Client::_vxattrcb_quota_exists,			\
  flags: 0,							\
}
12031
// Virtual xattrs exposed on directories.  _match_vxattr() scans this
// table linearly; it must end with an empty-name terminator entry.
const Client::VXattr Client::_dir_vxattrs[] = {
  // Whole layout in one string; writable, present only when non-default.
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // Read-only directory statistics.
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive statistics; flagged VXATTR_RSTAT.
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  // Quota: combined view plus individual writable fields.
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // Export pin (MDS rank); present only when a pin is set.
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  // Snapshot birth time; read-only, present only inside snapshots.
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12078
// Virtual xattrs exposed on regular files; same scan/terminator rules as
// _dir_vxattrs above.
const Client::VXattr Client::_file_vxattrs[] = {
  // Whole layout in one string; writable, present only when non-default.
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  // Snapshot birth time; read-only, present only inside snapshots.
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12101
12102 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12103 {
12104 if (in->is_dir())
12105 return _dir_vxattrs;
12106 else if (in->is_file())
12107 return _file_vxattrs;
12108 return NULL;
12109 }
12110
12111 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12112 {
12113 if (strncmp(name, "ceph.", 5) == 0) {
12114 const VXattr *vxattr = _get_vxattrs(in);
12115 if (vxattr) {
12116 while (!vxattr->name.empty()) {
12117 if (vxattr->name == name)
12118 return vxattr;
12119 vxattr++;
12120 }
12121 }
12122 }
12123 return NULL;
12124 }
12125
12126 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12127 {
12128 std::lock_guard lock(client_lock);
12129
12130 if (unmounting)
12131 return -ENOTCONN;
12132
12133 vinodeno_t vino = _get_vino(in);
12134
12135 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12136 tout(cct) << "ll_readlink" << std::endl;
12137 tout(cct) << vino.ino.val << std::endl;
12138
12139 for (auto dn : in->dentries) {
12140 touch_dn(dn);
12141 }
12142
12143 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12144 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12145 return r;
12146 }
12147
// Create a special/regular file node "name" under dir via
// CEPH_MDS_OP_MKNOD.  On success *inp references the new inode.
// Returns 0 or a negative errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12201
12202 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12203 dev_t rdev, struct stat *attr, Inode **out,
12204 const UserPerm& perms)
12205 {
12206 std::lock_guard lock(client_lock);
12207
12208 if (unmounting)
12209 return -ENOTCONN;
12210
12211 vinodeno_t vparent = _get_vino(parent);
12212
12213 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12214 tout(cct) << "ll_mknod" << std::endl;
12215 tout(cct) << vparent.ino.val << std::endl;
12216 tout(cct) << name << std::endl;
12217 tout(cct) << mode << std::endl;
12218 tout(cct) << rdev << std::endl;
12219
12220 if (!fuse_default_permissions) {
12221 int r = may_create(parent, perms);
12222 if (r < 0)
12223 return r;
12224 }
12225
12226 InodeRef in;
12227 int r = _mknod(parent, name, mode, rdev, perms, &in);
12228 if (r == 0) {
12229 fill_stat(in, attr);
12230 _ll_get(in.get());
12231 }
12232 tout(cct) << attr->st_ino << std::endl;
12233 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12234 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12235 *out = in.get();
12236 return r;
12237 }
12238
12239 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12240 dev_t rdev, Inode **out,
12241 struct ceph_statx *stx, unsigned want, unsigned flags,
12242 const UserPerm& perms)
12243 {
12244 unsigned caps = statx_to_mask(flags, want);
12245 std::lock_guard lock(client_lock);
12246
12247 if (unmounting)
12248 return -ENOTCONN;
12249
12250 vinodeno_t vparent = _get_vino(parent);
12251
12252 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12253 tout(cct) << "ll_mknodx" << std::endl;
12254 tout(cct) << vparent.ino.val << std::endl;
12255 tout(cct) << name << std::endl;
12256 tout(cct) << mode << std::endl;
12257 tout(cct) << rdev << std::endl;
12258
12259 if (!fuse_default_permissions) {
12260 int r = may_create(parent, perms);
12261 if (r < 0)
12262 return r;
12263 }
12264
12265 InodeRef in;
12266 int r = _mknod(parent, name, mode, rdev, perms, &in);
12267 if (r == 0) {
12268 fill_statx(in, caps, stx);
12269 _ll_get(in.get());
12270 }
12271 tout(cct) << stx->stx_ino << std::endl;
12272 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12273 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12274 *out = in.get();
12275 return r;
12276 }
12277
// Create (and optionally open) regular file "name" under dir.
// flags/mode are POSIX open flags and permission bits; stripe_unit,
// stripe_count, object_size and data_pool optionally override the file
// layout.  On success *inp references the new inode; when fhp != NULL
// the file is also opened and *fhp receives the handle.  *created (if
// non-NULL) is filled in by make_request().  Returns 0 or -errno.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the requested data pool name to an id, if one was given.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!  open.pool is only 32 bits on the wire
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12372
12373
// Create directory "name" in dir.  When dir is the virtual .snap
// directory this creates a snapshot (CEPH_MDS_OP_MKSNAP) instead of a
// plain directory (CEPH_MDS_OP_MKDIR).  On success *inp references the
// new inode.  Returns 0 or a negative errno.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writes inside a snapshot are forbidden; mkdir in .snap itself is the
  // special case that creates a new snapshot.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // Inherit default ACLs from the parent; this may adjust the mode and
  // produce xattr data to ship with the request.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12429
12430 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12431 struct stat *attr, Inode **out, const UserPerm& perm)
12432 {
12433 std::lock_guard lock(client_lock);
12434
12435 if (unmounting)
12436 return -ENOTCONN;
12437
12438 vinodeno_t vparent = _get_vino(parent);
12439
12440 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12441 tout(cct) << "ll_mkdir" << std::endl;
12442 tout(cct) << vparent.ino.val << std::endl;
12443 tout(cct) << name << std::endl;
12444 tout(cct) << mode << std::endl;
12445
12446 if (!fuse_default_permissions) {
12447 int r = may_create(parent, perm);
12448 if (r < 0)
12449 return r;
12450 }
12451
12452 InodeRef in;
12453 int r = _mkdir(parent, name, mode, perm, &in);
12454 if (r == 0) {
12455 fill_stat(in, attr);
12456 _ll_get(in.get());
12457 }
12458 tout(cct) << attr->st_ino << std::endl;
12459 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12460 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12461 *out = in.get();
12462 return r;
12463 }
12464
12465 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12466 struct ceph_statx *stx, unsigned want, unsigned flags,
12467 const UserPerm& perms)
12468 {
12469 std::lock_guard lock(client_lock);
12470
12471 if (unmounting)
12472 return -ENOTCONN;
12473
12474 vinodeno_t vparent = _get_vino(parent);
12475
12476 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12477 tout(cct) << "ll_mkdirx" << std::endl;
12478 tout(cct) << vparent.ino.val << std::endl;
12479 tout(cct) << name << std::endl;
12480 tout(cct) << mode << std::endl;
12481
12482 if (!fuse_default_permissions) {
12483 int r = may_create(parent, perms);
12484 if (r < 0)
12485 return r;
12486 }
12487
12488 InodeRef in;
12489 int r = _mkdir(parent, name, mode, perms, &in);
12490 if (r == 0) {
12491 fill_statx(in, statx_to_mask(flags, want), stx);
12492 _ll_get(in.get());
12493 } else {
12494 stx->stx_ino = 0;
12495 stx->stx_mask = 0;
12496 }
12497 tout(cct) << stx->stx_ino << std::endl;
12498 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12499 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12500 *out = in.get();
12501 return r;
12502 }
12503
// Create symlink "name" -> target in dir via CEPH_MDS_OP_SYMLINK.
// On success *inp references the new inode.  Returns 0 or -errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // Respect the file-count quota on this subtree.
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // the link target travels in string2
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12549
12550 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12551 struct stat *attr, Inode **out, const UserPerm& perms)
12552 {
12553 std::lock_guard lock(client_lock);
12554
12555 if (unmounting)
12556 return -ENOTCONN;
12557
12558 vinodeno_t vparent = _get_vino(parent);
12559
12560 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12561 << dendl;
12562 tout(cct) << "ll_symlink" << std::endl;
12563 tout(cct) << vparent.ino.val << std::endl;
12564 tout(cct) << name << std::endl;
12565 tout(cct) << value << std::endl;
12566
12567 if (!fuse_default_permissions) {
12568 int r = may_create(parent, perms);
12569 if (r < 0)
12570 return r;
12571 }
12572
12573 InodeRef in;
12574 int r = _symlink(parent, name, value, perms, &in);
12575 if (r == 0) {
12576 fill_stat(in, attr);
12577 _ll_get(in.get());
12578 }
12579 tout(cct) << attr->st_ino << std::endl;
12580 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12581 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12582 *out = in.get();
12583 return r;
12584 }
12585
12586 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12587 Inode **out, struct ceph_statx *stx, unsigned want,
12588 unsigned flags, const UserPerm& perms)
12589 {
12590 std::lock_guard lock(client_lock);
12591
12592 if (unmounting)
12593 return -ENOTCONN;
12594
12595 vinodeno_t vparent = _get_vino(parent);
12596
12597 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12598 << dendl;
12599 tout(cct) << "ll_symlinkx" << std::endl;
12600 tout(cct) << vparent.ino.val << std::endl;
12601 tout(cct) << name << std::endl;
12602 tout(cct) << value << std::endl;
12603
12604 if (!fuse_default_permissions) {
12605 int r = may_create(parent, perms);
12606 if (r < 0)
12607 return r;
12608 }
12609
12610 InodeRef in;
12611 int r = _symlink(parent, name, value, perms, &in);
12612 if (r == 0) {
12613 fill_statx(in, statx_to_mask(flags, want), stx);
12614 _ll_get(in.get());
12615 }
12616 tout(cct) << stx->stx_ino << std::endl;
12617 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12618 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12619 *out = in.get();
12620 return r;
12621 }
12622
// Unlink "name" from dir via CEPH_MDS_OP_UNLINK.  Returns 0 or -errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are immutable.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  // Drop the parent's dentry lease unless it holds FILE_EXCL.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the target inode so we can attach it to the request and
  // break any delegations handed out on it.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12672
12673 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12674 {
12675 std::lock_guard lock(client_lock);
12676
12677 if (unmounting)
12678 return -ENOTCONN;
12679
12680 vinodeno_t vino = _get_vino(in);
12681
12682 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12683 tout(cct) << "ll_unlink" << std::endl;
12684 tout(cct) << vino.ino.val << std::endl;
12685 tout(cct) << name << std::endl;
12686
12687 if (!fuse_default_permissions) {
12688 int r = may_delete(in, name, perm);
12689 if (r < 0)
12690 return r;
12691 }
12692 return _unlink(in, name, perm);
12693 }
12694
// Remove directory "name" from dir.  When dir is the virtual .snap
// directory this removes a snapshot (CEPH_MDS_OP_RMSNAP) instead of a
// plain directory (CEPH_MDS_OP_RMDIR).  Returns 0 or -errno.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    // RMDIR: the request owns the dentry.
    req->set_dentry(de);
  else
    // RMSNAP: hold a temporary ref; the dentry is unlinked locally below
    // rather than being handed to the request.
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // The request was never submitted; drop our reference here.
  put_request(req);
  return res;
}
12747
12748 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12749 {
12750 std::lock_guard lock(client_lock);
12751
12752 if (unmounting)
12753 return -ENOTCONN;
12754
12755 vinodeno_t vino = _get_vino(in);
12756
12757 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12758 tout(cct) << "ll_rmdir" << std::endl;
12759 tout(cct) << vino.ino.val << std::endl;
12760 tout(cct) << name << std::endl;
12761
12762 if (!fuse_default_permissions) {
12763 int r = may_delete(in, name, perms);
12764 if (r < 0)
12765 return r;
12766 }
12767
12768 return _rmdir(in, name, perms);
12769 }
12770
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  // Rename fromdir/fromname -> todir/toname. Inside the snapdir (and only
  // there) this becomes a snapshot rename; snapshotted dirs are otherwise
  // read-only. Cross-quota-root moves are checked against the destination
  // quota up front so the MDS isn't asked to do a move that must fail.
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  // Both ends must live in the same snapshot context.
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    // Only a move between different directories can cross quota roots.
    if (fromdir != todir) {
      fromdir_root =
        fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
        todir->quota.is_enable() ? todir : get_quota_root(todir, perm);

      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
        // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
        // to auth MDS to get latest rstat for todir_root and source dir
        // even if their dentry caches and inode caps are satisfied.
        res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
        if (res < 0)
          goto fail;

        quota_check = true;
        if (oldde->inode && oldde->inode->is_dir()) {
          mask |= CEPH_STAT_RSTAT;
        }
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    // Recall delegations on the source before mutating its namespace.
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      // Size/file-count the source subtree (or file) would add to the
      // destination quota root.
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
        old_bytes = oldinode->rstat.rbytes;
        old_files = oldinode->rstat.rsize();
      } else {
        old_bytes = oldinode->size;
        old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
          (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
                       << old_bytes << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
          (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
                       << old_files << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (quota_exceed) {
        // Directories report EXDEV (caller may fall back to copy);
        // plain files report quota exhaustion directly.
        res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
        goto fail;
      }
    }

    // If the destination name already exists, its inode will be unlinked
    // by the rename; ENOENT here simply means no target to replace.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12926
12927 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12928 const char *newname, const UserPerm& perm)
12929 {
12930 std::lock_guard lock(client_lock);
12931
12932 if (unmounting)
12933 return -ENOTCONN;
12934
12935 vinodeno_t vparent = _get_vino(parent);
12936 vinodeno_t vnewparent = _get_vino(newparent);
12937
12938 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12939 << vnewparent << " " << newname << dendl;
12940 tout(cct) << "ll_rename" << std::endl;
12941 tout(cct) << vparent.ino.val << std::endl;
12942 tout(cct) << name << std::endl;
12943 tout(cct) << vnewparent.ino.val << std::endl;
12944 tout(cct) << newname << std::endl;
12945
12946 if (!fuse_default_permissions) {
12947 int r = may_delete(parent, name, perm);
12948 if (r < 0)
12949 return r;
12950 r = may_delete(newparent, newname, perm);
12951 if (r < 0 && r != -ENOENT)
12952 return r;
12953 }
12954
12955 return _rename(parent, name, newparent, newname, perm);
12956 }
12957
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  // Create hard link `newname` in `dir` pointing at existing inode `in`.
  // On success *inp holds the linked inode from the MDS reply trace.
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Links may not be created inside snapshots.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // A new directory entry counts against the max_files quota.
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // Recall any delegations on the target before changing its link count.
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13002
13003 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13004 const UserPerm& perm)
13005 {
13006 std::lock_guard lock(client_lock);
13007
13008 if (unmounting)
13009 return -ENOTCONN;
13010
13011 vinodeno_t vino = _get_vino(in);
13012 vinodeno_t vnewparent = _get_vino(newparent);
13013
13014 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13015 newname << dendl;
13016 tout(cct) << "ll_link" << std::endl;
13017 tout(cct) << vino.ino.val << std::endl;
13018 tout(cct) << vnewparent << std::endl;
13019 tout(cct) << newname << std::endl;
13020
13021 InodeRef target;
13022
13023 if (!fuse_default_permissions) {
13024 if (S_ISDIR(in->mode))
13025 return -EPERM;
13026
13027 int r = may_hardlink(in, perm);
13028 if (r < 0)
13029 return r;
13030
13031 r = may_create(newparent, perm);
13032 if (r < 0)
13033 return r;
13034 }
13035
13036 return _link(in, newparent, newname, perm, &target);
13037 }
13038
13039 int Client::ll_num_osds(void)
13040 {
13041 std::lock_guard lock(client_lock);
13042 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13043 }
13044
13045 int Client::ll_osdaddr(int osd, uint32_t *addr)
13046 {
13047 std::lock_guard lock(client_lock);
13048
13049 entity_addr_t g;
13050 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13051 if (!o.exists(osd))
13052 return false;
13053 g = o.get_addrs(osd).front();
13054 return true;
13055 });
13056 if (!exists)
13057 return -1;
13058 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13059 *addr = ntohl(nb_addr);
13060 return 0;
13061 }
13062
13063 uint32_t Client::ll_stripe_unit(Inode *in)
13064 {
13065 std::lock_guard lock(client_lock);
13066 return in->layout.stripe_unit;
13067 }
13068
13069 uint64_t Client::ll_snap_seq(Inode *in)
13070 {
13071 std::lock_guard lock(client_lock);
13072 return in->snaprealm->seq;
13073 }
13074
13075 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13076 {
13077 std::lock_guard lock(client_lock);
13078 *layout = in->layout;
13079 return 0;
13080 }
13081
13082 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13083 {
13084 return ll_file_layout(fh->inode.get(), layout);
13085 }
13086
13087 /* Currently we cannot take advantage of redundancy in reads, since we
13088 would have to go through all possible placement groups (a
13089 potentially quite large number determined by a hash), and use CRUSH
13090 to calculate the appropriate set of OSDs for each placement group,
13091 then index into that. An array with one entry per OSD is much more
13092 tractable and works for demonstration purposes. */
13093
13094 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13095 file_layout_t* layout)
13096 {
13097 std::lock_guard lock(client_lock);
13098
13099 inodeno_t ino = in->ino;
13100 uint32_t object_size = layout->object_size;
13101 uint32_t su = layout->stripe_unit;
13102 uint32_t stripe_count = layout->stripe_count;
13103 uint64_t stripes_per_object = object_size / su;
13104 uint64_t stripeno = 0, stripepos = 0;
13105
13106 if(stripe_count) {
13107 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13108 stripepos = blockno % stripe_count; // which object in the object set (X)
13109 }
13110 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13111 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13112
13113 object_t oid = file_object_t(ino, objectno);
13114 return objecter->with_osdmap([&](const OSDMap& o) {
13115 ceph_object_layout olayout =
13116 o.file_to_object_layout(oid, *layout);
13117 pg_t pg = (pg_t)olayout.ol_pgid;
13118 vector<int> osds;
13119 int primary;
13120 o.pg_to_acting_osds(pg, &osds, &primary);
13121 return primary;
13122 });
13123 }
13124
13125 /* Return the offset of the block, internal to the object */
13126
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  // Byte offset of stripe-unit block `blockno` within its RADOS object.
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): divides by su (and below by stripes_per_object) with no
  // zero check -- assumes the inode carries a valid layout; confirm callers.
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
13137
13138 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13139 const UserPerm& perms)
13140 {
13141 std::lock_guard lock(client_lock);
13142
13143 if (unmounting)
13144 return -ENOTCONN;
13145
13146 vinodeno_t vino = _get_vino(in);
13147
13148 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13149 tout(cct) << "ll_opendir" << std::endl;
13150 tout(cct) << vino.ino.val << std::endl;
13151
13152 if (!fuse_default_permissions) {
13153 int r = may_open(in, flags, perms);
13154 if (r < 0)
13155 return r;
13156 }
13157
13158 int r = _opendir(in, dirpp, perms);
13159 tout(cct) << (unsigned long)*dirpp << std::endl;
13160
13161 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13162 << dendl;
13163 return r;
13164 }
13165
13166 int Client::ll_releasedir(dir_result_t *dirp)
13167 {
13168 std::lock_guard lock(client_lock);
13169 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13170 tout(cct) << "ll_releasedir" << std::endl;
13171 tout(cct) << (unsigned long)dirp << std::endl;
13172
13173 if (unmounting)
13174 return -ENOTCONN;
13175
13176 _closedir(dirp);
13177 return 0;
13178 }
13179
13180 int Client::ll_fsyncdir(dir_result_t *dirp)
13181 {
13182 std::lock_guard lock(client_lock);
13183 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13184 tout(cct) << "ll_fsyncdir" << std::endl;
13185 tout(cct) << (unsigned long)dirp << std::endl;
13186
13187 if (unmounting)
13188 return -ENOTCONN;
13189
13190 return _fsync(dirp->inode.get(), false);
13191 }
13192
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  // Open an existing inode; creation must go through ll_create/ll_createx.
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // Track any handle we produced so a forced unmount can clean up fds the
  // low-level consumer never closed.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13228
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  // Shared implementation for ll_create/ll_createx: look the name up,
  // create it if absent and O_CREAT was given, then open it. On success
  // *in holds the inode and *fhp an open handle.
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL requires the name to not already exist.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create() may already return an open handle in *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: re-check open permission, and open a handle if
    // we don't have one yet.
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track the handle (if any) so unmount can clean up unclosed fds.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13310
13311 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13312 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13313 const UserPerm& perms)
13314 {
13315 std::lock_guard lock(client_lock);
13316 InodeRef in;
13317
13318 if (unmounting)
13319 return -ENOTCONN;
13320
13321 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13322 fhp, perms);
13323 if (r >= 0) {
13324 ceph_assert(in);
13325
13326 // passing an Inode in outp requires an additional ref
13327 if (outp) {
13328 _ll_get(in.get());
13329 *outp = in.get();
13330 }
13331 fill_stat(in, attr);
13332 } else {
13333 attr->st_ino = 0;
13334 }
13335
13336 return r;
13337 }
13338
13339 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13340 int oflags, Inode **outp, Fh **fhp,
13341 struct ceph_statx *stx, unsigned want, unsigned lflags,
13342 const UserPerm& perms)
13343 {
13344 unsigned caps = statx_to_mask(lflags, want);
13345 std::lock_guard lock(client_lock);
13346 InodeRef in;
13347
13348 if (unmounting)
13349 return -ENOTCONN;
13350
13351 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13352 if (r >= 0) {
13353 ceph_assert(in);
13354
13355 // passing an Inode in outp requires an additional ref
13356 if (outp) {
13357 _ll_get(in.get());
13358 *outp = in.get();
13359 }
13360 fill_statx(in, caps, stx);
13361 } else {
13362 stx->stx_ino = 0;
13363 stx->stx_mask = 0;
13364 }
13365
13366 return r;
13367 }
13368
13369 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13370 {
13371 std::lock_guard lock(client_lock);
13372 tout(cct) << "ll_lseek" << std::endl;
13373 tout(cct) << offset << std::endl;
13374 tout(cct) << whence << std::endl;
13375
13376 if (unmounting)
13377 return -ENOTCONN;
13378
13379 return _lseek(fh, offset, whence);
13380 }
13381
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  // Read up to `len` bytes at `off` into *bl; returns bytes read or -errno.
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return more bytes read than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
13401
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  // Read one RADOS object ("block") of the file directly from the OSDs,
  // bypassing the client cache; the data is copied into `buf`.
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the client lock for the OSD round trip, then retake it.
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
    r = bl.length();   // report the number of bytes actually read
  }

  return r;
}
13438
13439 /* It appears that the OSD doesn't return success unless the entire
13440 buffer was written, return the write length on success. */
13441
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  // Write one RADOS object of the file directly, bypassing the cache.
  // Returns `length` on success (per the note above, OSDs only ack
  // complete writes) or a negative errno.
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE(review): `true ||` forces every write to wait for commit
  // regardless of `sync`; the async path is effectively disabled here.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // Use the caller-provided snap sequence rather than the inode's realm.
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.lock();
  if (unmounting) {
    client_lock.unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Drop the lock before blocking on the commit ack.
  client_lock.unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
13499
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  // Barrier/commit support is currently compiled out; this call is a no-op
  // that always reports success. The commented block below is the retained
  // original implementation.
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13525
13526 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13527 {
13528 std::lock_guard lock(client_lock);
13529 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13530 "~" << len << dendl;
13531 tout(cct) << "ll_write" << std::endl;
13532 tout(cct) << (unsigned long)fh << std::endl;
13533 tout(cct) << off << std::endl;
13534 tout(cct) << len << std::endl;
13535
13536 if (unmounting)
13537 return -ENOTCONN;
13538
13539 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13540 len = std::min(len, (loff_t)INT_MAX);
13541 int r = _write(fh, off, len, data, NULL, 0);
13542 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13543 << dendl;
13544 return r;
13545 }
13546
13547 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13548 {
13549 std::lock_guard lock(client_lock);
13550 if (unmounting)
13551 return -ENOTCONN;
13552 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13553 }
13554
13555 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13556 {
13557 std::lock_guard lock(client_lock);
13558 if (unmounting)
13559 return -ENOTCONN;
13560 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13561 }
13562
13563 int Client::ll_flush(Fh *fh)
13564 {
13565 std::lock_guard lock(client_lock);
13566 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13567 tout(cct) << "ll_flush" << std::endl;
13568 tout(cct) << (unsigned long)fh << std::endl;
13569
13570 if (unmounting)
13571 return -ENOTCONN;
13572
13573 return _flush(fh);
13574 }
13575
13576 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13577 {
13578 std::lock_guard lock(client_lock);
13579 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13580 tout(cct) << "ll_fsync" << std::endl;
13581 tout(cct) << (unsigned long)fh << std::endl;
13582
13583 if (unmounting)
13584 return -ENOTCONN;
13585
13586 int r = _fsync(fh, syncdataonly);
13587 if (r) {
13588 // If we're returning an error, clear it from the FH
13589 fh->take_async_err();
13590 }
13591 return r;
13592 }
13593
13594 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13595 {
13596 std::lock_guard lock(client_lock);
13597 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13598 tout(cct) << "ll_sync_inode" << std::endl;
13599 tout(cct) << (unsigned long)in << std::endl;
13600
13601 if (unmounting)
13602 return -ENOTCONN;
13603
13604 return _fsync(in, syncdataonly);
13605 }
13606
13607 #ifdef FALLOC_FL_PUNCH_HOLE
13608
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Preallocate or punch a hole in [offset, offset+length). Only
  // FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (which requires
  // KEEP_SIZE) are supported.
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching a hole without KEEP_SIZE is not supported.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating on a full pool would need new space; punching frees space,
  // so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Growing the file must respect the bytes quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data with buffer caps: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  inline_iter.copy(offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len) {
	  inline_iter += size;
	  inline_iter.copy(len - offset - size, bl);
	}
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first if needed, then zero the range on the
      // OSDs directly.
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
	uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero op without holding the client lock.
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: only the size (and times) change; no data is
    // actually written out.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // Wait (unlocked) for the uninline to complete; ECANCELED means the
    // data was already uninlined by someone else, which is fine.
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13729 #else
13730
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  // Fallback when FALLOC_FL_PUNCH_HOLE isn't defined on this platform:
  // fallocate is not supported at all.
  return -EOPNOTSUPP;
}
13735
13736 #endif
13737
13738
13739 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13740 {
13741 std::lock_guard lock(client_lock);
13742 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13743 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13744 tout(cct) << (unsigned long)fh << std::endl;
13745
13746 if (unmounting)
13747 return -ENOTCONN;
13748
13749 return _fallocate(fh, mode, offset, length);
13750 }
13751
13752 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13753 {
13754 std::lock_guard lock(client_lock);
13755 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13756
13757 if (unmounting)
13758 return -ENOTCONN;
13759
13760 Fh *fh = get_filehandle(fd);
13761 if (!fh)
13762 return -EBADF;
13763 #if defined(__linux__) && defined(O_PATH)
13764 if (fh->flags & O_PATH)
13765 return -EBADF;
13766 #endif
13767 return _fallocate(fh, mode, offset, length);
13768 }
13769
13770 int Client::ll_release(Fh *fh)
13771 {
13772 std::lock_guard lock(client_lock);
13773
13774 if (unmounting)
13775 return -ENOTCONN;
13776
13777 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13778 dendl;
13779 tout(cct) << __func__ << " (fh)" << std::endl;
13780 tout(cct) << (unsigned long)fh << std::endl;
13781
13782 if (ll_unclosed_fh_set.count(fh))
13783 ll_unclosed_fh_set.erase(fh);
13784 return _release_fh(fh);
13785 }
13786
13787 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13788 {
13789 std::lock_guard lock(client_lock);
13790
13791 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13792 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13793
13794 if (unmounting)
13795 return -ENOTCONN;
13796
13797 return _getlk(fh, fl, owner);
13798 }
13799
13800 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13801 {
13802 std::lock_guard lock(client_lock);
13803
13804 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13805 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13806
13807 if (unmounting)
13808 return -ENOTCONN;
13809
13810 return _setlk(fh, fl, owner, sleep);
13811 }
13812
13813 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13814 {
13815 std::lock_guard lock(client_lock);
13816
13817 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13818 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13819
13820 if (unmounting)
13821 return -ENOTCONN;
13822
13823 return _flock(fh, cmd, owner);
13824 }
13825
13826 int Client::set_deleg_timeout(uint32_t timeout)
13827 {
13828 std::lock_guard lock(client_lock);
13829
13830 /*
13831 * The whole point is to prevent blacklisting so we must time out the
13832 * delegation before the session autoclose timeout kicks in.
13833 */
13834 if (timeout >= mdsmap->get_session_autoclose())
13835 return -EINVAL;
13836
13837 deleg_timeout = timeout;
13838 return 0;
13839 }
13840
13841 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13842 {
13843 int ret = -EINVAL;
13844
13845 std::lock_guard lock(client_lock);
13846
13847 if (!mounted)
13848 return -ENOTCONN;
13849
13850 Inode *inode = fh->inode.get();
13851
13852 switch(cmd) {
13853 case CEPH_DELEGATION_NONE:
13854 inode->unset_deleg(fh);
13855 ret = 0;
13856 break;
13857 default:
13858 try {
13859 ret = inode->set_deleg(fh, cmd, cb, priv);
13860 } catch (std::bad_alloc&) {
13861 ret = -ENOMEM;
13862 }
13863 break;
13864 }
13865 return ret;
13866 }
13867
// Finisher context used by ll_interrupt(): forwards an interrupt for an
// in-flight SETFILELOCK MetaRequest back into the Client with
// client_lock held.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // pin the request until finish() has run
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // only file-lock requests are interruptible via this path
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the ctor
  }
};
13883
// Interrupt a pending file-lock request; d is the opaque MetaRequest*
// previously handed to the caller. The work is queued on
// interrupt_finisher so it runs without the caller's locks held.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13891
13892 // =========================================
13893 // layout
13894
13895 // expose file layouts
13896
13897 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13898 const UserPerm& perms)
13899 {
13900 std::lock_guard lock(client_lock);
13901
13902 if (unmounting)
13903 return -ENOTCONN;
13904
13905 filepath path(relpath);
13906 InodeRef in;
13907 int r = path_walk(path, &in, perms);
13908 if (r < 0)
13909 return r;
13910
13911 *lp = in->layout;
13912
13913 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
13914 return 0;
13915 }
13916
13917 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13918 {
13919 std::lock_guard lock(client_lock);
13920
13921 if (unmounting)
13922 return -ENOTCONN;
13923
13924 Fh *f = get_filehandle(fd);
13925 if (!f)
13926 return -EBADF;
13927 Inode *in = f->inode.get();
13928
13929 *lp = in->layout;
13930
13931 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13932 return 0;
13933 }
13934
13935 int64_t Client::get_default_pool_id()
13936 {
13937 std::lock_guard lock(client_lock);
13938
13939 if (unmounting)
13940 return -ENOTCONN;
13941
13942 /* first data pool is the default */
13943 return mdsmap->get_first_data_pool();
13944 }
13945
13946 // expose osdmap
13947
13948 int64_t Client::get_pool_id(const char *pool_name)
13949 {
13950 std::lock_guard lock(client_lock);
13951
13952 if (unmounting)
13953 return -ENOTCONN;
13954
13955 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13956 pool_name);
13957 }
13958
13959 string Client::get_pool_name(int64_t pool)
13960 {
13961 std::lock_guard lock(client_lock);
13962
13963 if (unmounting)
13964 return string();
13965
13966 return objecter->with_osdmap([pool](const OSDMap& o) {
13967 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13968 });
13969 }
13970
13971 int Client::get_pool_replication(int64_t pool)
13972 {
13973 std::lock_guard lock(client_lock);
13974
13975 if (unmounting)
13976 return -ENOTCONN;
13977
13978 return objecter->with_osdmap([pool](const OSDMap& o) {
13979 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13980 });
13981 }
13982
/**
 * Find the acting OSD set for the file extent containing 'off', and
 * optionally the remaining length of that extent's stripe unit.
 *
 * @param fd   open descriptor
 * @param off  file offset to locate
 * @param len  out (optional): bytes from 'off' to the end of the stripe unit
 * @param osds out: acting OSDs for the containing object's PG
 * @return 0 on success, -ENOTCONN/-EBADF, or -EINVAL if no acting OSDs
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map a single byte at 'off' through the striping layout; this always
  // yields exactly one extent, identifying the containing object.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // Resolve the object to its PG and the PG's acting OSD set.
  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14028
14029 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14030 {
14031 std::lock_guard lock(client_lock);
14032
14033 if (unmounting)
14034 return -ENOTCONN;
14035
14036 if (id < 0)
14037 return -EINVAL;
14038 return objecter->with_osdmap([&](const OSDMap& o) {
14039 return o.crush->get_full_location_ordered(id, path);
14040 });
14041 }
14042
/**
 * Return the addresses of the acting OSDs for the object containing
 * file offset 'offset' of the given open file.
 *
 * @param fd      open descriptor
 * @param offset  file offset to locate
 * @param address out: one address per acting OSD
 * @return 0 on success, -ENOTCONN/-EBADF, or -EINVAL if no acting OSDs
 */
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 => exactly one extent back)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	// take the first (front) address of each OSD's address vector
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
14076
14077 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14078 {
14079 std::lock_guard lock(client_lock);
14080
14081 if (unmounting)
14082 return -ENOTCONN;
14083
14084 return objecter->with_osdmap([&](const OSDMap& o) {
14085 if (!o.exists(osd))
14086 return -ENOENT;
14087
14088 addr = o.get_addrs(osd).front();
14089 return 0;
14090 });
14091 }
14092
14093 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14094 loff_t length, loff_t offset)
14095 {
14096 std::lock_guard lock(client_lock);
14097
14098 if (unmounting)
14099 return -ENOTCONN;
14100
14101 Fh *f = get_filehandle(fd);
14102 if (!f)
14103 return -EBADF;
14104 Inode *in = f->inode.get();
14105
14106 // map to a list of extents
14107 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14108
14109 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14110 return 0;
14111 }
14112
14113
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Cache the result per OSDMap epoch: only rescan the map for an OSD
  // on our own IP when the map has changed since the last lookup.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
14130
14131
14132
14133
14134
14135
14136 // ===============================
14137
// Messenger callback: a connection we initiated is now established.
// Nothing to do beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14142
// Messenger callback: the connection was reset.  We just log and return
// false (presumably "not handled here" — session recovery happens via
// other paths; TODO confirm against Dispatcher contract).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14148
// Messenger callback: the peer reset the session (remote side dropped
// its connection state).  For MDS peers, adjust the matching MetaSession
// according to its current state.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // we wanted it closed anyway; treat the reset as completion
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // restart the open attempt, carrying over anyone who was
	    // waiting for the session to open
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // no established state to repair
	  break;
	}
      }
    }
    break;
  }
}
14207
// Messenger callback: the peer actively refused our connection attempt.
// Log it and return false (no special handling here).
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14213
// Walk up the snap-realm hierarchy from 'in' and return the nearest
// ancestor inode with a quota enabled; falls back to root_ancestor when
// none is found or an ancestor is not in the inode cache.
// NOTE(review): 'perms' is currently unused in this implementation.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;  // ancestor not cached; give up and use root_ancestor

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14235
14236 /**
14237 * Traverse quota ancestors of the Inode, return true
14238 * if any of them passes the passed function
14239 */
14240 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14241 std::function<bool (const Inode &in)> test)
14242 {
14243 while (true) {
14244 ceph_assert(in != NULL);
14245 if (test(*in)) {
14246 return true;
14247 }
14248
14249 if (in == root_ancestor) {
14250 // We're done traversing, drop out
14251 return false;
14252 } else {
14253 // Continue up the tree
14254 in = get_quota_root(in, perms);
14255 }
14256 }
14257
14258 return false;
14259 }
14260
14261 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14262 {
14263 return check_quota_condition(in, perms,
14264 [](const Inode &in) {
14265 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14266 });
14267 }
14268
14269 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14270 const UserPerm& perms)
14271 {
14272 return check_quota_condition(in, perms,
14273 [&new_bytes](const Inode &in) {
14274 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14275 > in.quota.max_bytes;
14276 });
14277 }
14278
14279 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14280 {
14281 ceph_assert(in->size >= in->reported_size);
14282 const uint64_t size = in->size - in->reported_size;
14283 return check_quota_condition(in, perms,
14284 [&size](const Inode &in) {
14285 if (in.quota.max_bytes) {
14286 if (in.rstat.rbytes >= in.quota.max_bytes) {
14287 return true;
14288 }
14289
14290 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14291 return (space >> 4) < size;
14292 } else {
14293 return false;
14294 }
14295 });
14296 }
14297
// State flags cached by check_pool_perm(), keyed by (pool id, namespace).
enum {
  POOL_CHECKED = 1,   // a check has completed; READ/WRITE bits are valid
  POOL_CHECKING = 2,  // a check is in flight; other callers wait on it
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14304
/**
 * Verify that this client may read and/or write the data pool backing
 * 'in', probing the pool once and caching the result per
 * (pool id, pool namespace).
 *
 * @param in   inode whose layout selects the pool
 * @param need CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if permitted, -EPERM if the pool denies the needed access,
 *         -EIO if the probe failed for an unexpected reason
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the check as in flight so concurrent callers wait above
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // probe read access with a stat on the file's first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create of the same object
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop client_lock while blocking on the two OSD round trips
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    // -ENOENT just means the object doesn't exist yet; the read was allowed
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST means the exclusive create lost a race; the write was allowed
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // publish the verdict and wake all waiters
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14407
14408 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14409 {
14410 if (acl_type == POSIX_ACL) {
14411 if (in->xattrs.count(ACL_EA_ACCESS)) {
14412 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14413
14414 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14415 }
14416 }
14417 return -EAGAIN;
14418 }
14419
/**
 * Propagate a chmod into the inode's POSIX access ACL xattr so the ACL
 * stays consistent with the new mode bits.
 *
 * @return 0 on success (including no access ACL / ACLs disabled),
 *         negative errno on failure
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // refresh xattrs if we have never fetched them for this inode
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a copy; posix_acl_access_chmod edits the buffer in place
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;  // no access ACL to update
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14445
/**
 * Compute the ACL xattrs a new file/dir should inherit from its parent
 * directory's default ACL, adjusting *mode per POSIX ACL inheritance.
 *
 * @param dir       parent directory
 * @param mode      in/out: requested mode; may be masked or rewritten
 * @param xattrs_bl out: encoded xattr map to attach to the new inode
 * @return number of xattrs encoded (0 if none), negative errno on error
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // make sure the parent's xattrs (and thus its default ACL) are loaded
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // apply inheritance: may rewrite *mode and trim the ACL copy
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// if the ACL is not fully representable by mode bits, the child
	// gets it as its access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL: fall back to the process umask
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14493
14494 void Client::set_filer_flags(int flags)
14495 {
14496 std::lock_guard l(client_lock);
14497 ceph_assert(flags == 0 ||
14498 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14499 objecter->add_global_op_flags(flags);
14500 }
14501
14502 void Client::clear_filer_flags(int flags)
14503 {
14504 std::lock_guard l(client_lock);
14505 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14506 objecter->clear_global_op_flag(flags);
14507 }
14508
14509 // called before mount
14510 void Client::set_uuid(const std::string& uuid)
14511 {
14512 std::lock_guard l(client_lock);
14513 assert(initialized);
14514 assert(!uuid.empty());
14515
14516 metadata["uuid"] = uuid;
14517 _close_sessions();
14518 }
14519
14520 // called before mount. 0 means infinite
14521 void Client::set_session_timeout(unsigned timeout)
14522 {
14523 std::lock_guard l(client_lock);
14524 assert(initialized);
14525
14526 metadata["timeout"] = stringify(timeout);
14527 }
14528
// called before mount
/**
 * Reclaim the MDS session state of a dead client instance identified by
 * 'uuid' (e.g. NFS-Ganesha failover).
 *
 * @param uuid    uuid of the previous instance to reclaim
 * @param flags   CEPH_RECLAIM_* flags (CEPH_RECLAIM_RESET to just kill it)
 * @param fs_name filesystem to subscribe to (empty for the default)
 * @return 0 on success, or a negative errno: -EINVAL (bad uuid /
 *         umounting), -EPERM (session rejected), -EOPNOTSUPP (MDS lacks
 *         reclaim support), -ENOENT (no such session with RESET),
 *         -ENOTRECOVERABLE (reclaim failed or the target was blacklisted)
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // refuse to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  // Ask every in-map MDS rank to reclaim; only advance to the next rank
  // once this rank has answered (handle_client_reclaim_reply wakes us).
  reclaim_errno = 0;
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // blocks until handle_client_reclaim_reply() signals
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;  // this rank is done (RECLAIM_OK); move on
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember which uuid we are reclaiming; finish_reclaim() adopts it
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14627
// Complete a reclaim started by start_reclaim(): tell every MDS session
// the reclaim is finished and adopt the reclaimed uuid as our own.  If
// start_reclaim() never recorded a uuid, just reset per-session state.
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // take over the reclaimed instance's identity
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14646
// Handle an MClientReclaimReply from an MDS: on success record the OSD
// epoch to wait for and the reclaimed session's addrs; on failure record
// the errno.  Either way, wake start_reclaim() which is blocked on
// waiting_for_reclaim.
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // keep the highest epoch any MDS asked us to wait for
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14671
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e OSD map epoch the MDS must reach before acting on releases
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14684
// Config options we want handle_conf_change() notifications for.
// NULL-terminated, as the md_config observer interface requires.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14697
14698 void Client::handle_conf_change(const ConfigProxy& conf,
14699 const std::set <std::string> &changed)
14700 {
14701 std::lock_guard lock(client_lock);
14702
14703 if (changed.count("client_cache_mid")) {
14704 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14705 }
14706 if (changed.count("client_acl_type")) {
14707 acl_type = NO_ACL;
14708 if (cct->_conf->client_acl_type == "posix_acl")
14709 acl_type = POSIX_ACL;
14710 }
14711 }
14712
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14717
// boost::intrusive_ptr hook: drop a reference via the owning client so
// the inode can be reclaimed when the last reference goes away.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14722
14723 mds_rank_t Client::_get_random_up_mds() const
14724 {
14725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14726
14727 std::set<mds_rank_t> up;
14728 mdsmap->get_up_mds_set(up);
14729
14730 if (up.empty())
14731 return MDS_RANK_NONE;
14732 std::set<mds_rank_t>::const_iterator p = up.begin();
14733 for (int n = rand() % up.size(); n; n--)
14734 ++p;
14735 return *p;
14736 }
14737
14738
// A StandaloneClient owns its own Objecter (unlike an embedding process
// that supplies one to the base Client).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14745
StandaloneClient::~StandaloneClient()
{
  // we created the objecter in our ctor, so we destroy it here
  delete objecter;
  objecter = nullptr;
}
14751
// Bring up the standalone client: objecter, dispatchers, and monitor
// session, then finish base-class initialization.  On monclient failure
// we are in a half-initialized state and must unwind what was started.
// Returns 0 on success or the monclient error.
int StandaloneClient::init()
{
  _pre_init();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14781
// Tear down in dependency order: the client first, then the services
// it uses (objecter, monitor client).
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}