]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
import ceph 15.2.13
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #include <sys/utsname.h>
27 #include <sys/uio.h>
28
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
31
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
35 #else
36 #include <sys/xattr.h>
37 #endif
38
39 #if defined(__linux__)
40 #include <linux/falloc.h>
41 #endif
42
43 #include <sys/statvfs.h>
44
45 #include "common/config.h"
46 #include "common/version.h"
47
48 #include "mon/MonClient.h"
49
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
66
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
71
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
77
78 #define dout_subsys ceph_subsys_client
79
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
83
84 #include "Client.h"
85 #include "Inode.h"
86 #include "Dentry.h"
87 #include "Delegation.h"
88 #include "Dir.h"
89 #include "ClientSnapRealm.h"
90 #include "Fh.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
95
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
98
99 #include "include/cephfs/ceph_ll_client.h"
100
101 #if HAVE_GETGROUPLIST
102 #include <grp.h>
103 #include <pwd.h>
104 #include <unistd.h>
105 #endif
106
107 #undef dout_prefix
108 #define dout_prefix *_dout << "client." << whoami << " "
109
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111
112 // FreeBSD fails to define this
113 #ifndef O_DSYNC
114 #define O_DSYNC 0x0
115 #endif
116 // Darwin fails to define this
117 #ifndef O_RSYNC
118 #define O_RSYNC 0x0
119 #endif
120
121 #ifndef O_DIRECT
122 #define O_DIRECT 0x0
123 #endif
124
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126
127 using namespace TOPNSPC::common;
128
129 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
130 {
131 Client *client = static_cast<Client*>(p);
132 client->flush_set_callback(oset);
133 }
134
135
136 // -------------
137
138 Client::CommandHook::CommandHook(Client *client) :
139 m_client(client)
140 {
141 }
142
143 int Client::CommandHook::call(
144 std::string_view command,
145 const cmdmap_t& cmdmap,
146 Formatter *f,
147 std::ostream& errss,
148 bufferlist& out)
149 {
150 f->open_object_section("result");
151 {
152 std::lock_guard l{m_client->client_lock};
153 if (command == "mds_requests")
154 m_client->dump_mds_requests(f);
155 else if (command == "mds_sessions") {
156 bool cap_dump = false;
157 cmd_getval(cmdmap, "cap_dump", cap_dump);
158 m_client->dump_mds_sessions(f, cap_dump);
159 } else if (command == "dump_cache")
160 m_client->dump_cache(f);
161 else if (command == "kick_stale_sessions")
162 m_client->_kick_stale_sessions();
163 else if (command == "status")
164 m_client->dump_status(f);
165 else
166 ceph_abort_msg("bad command registered");
167 }
168 f->close_section();
169 return 0;
170 }
171
172
173 // -------------
174
175 dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
176 : inode(in), offset(0), next_offset(2),
177 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
178 perms(perms)
179 { }
180
181 void Client::_reset_faked_inos()
182 {
183 ino_t start = 1024;
184 free_faked_inos.clear();
185 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
186 last_used_faked_ino = 0;
187 last_used_faked_root = 0;
188 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
189 }
190
// Allocate the next free faked inode number for 'in'.
// The window [1024, 2048) is reserved for _assign_faked_root(); ordinary
// allocation scans upward from the last number handed out and wraps back
// to 2048 when the top of the free set is reached.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // ran off the top of the free set: wrap around, skipping the
    // reserved root window
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // the free set can never be fully exhausted here
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // jump forward to the start of the next free extent
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the current free extent: take the next number
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
212
213 /*
214 * In the faked mode, if you export multiple subdirectories,
215 * you will see that the inode numbers of the exported subdirectories
216 * are the same. so we distinguish the mount point by reserving
217 * the "fake ids" between "1024~2048" and combining the last
218 * 10bits(0x3ff) of the "root inodes".
219 */
220 void Client::_assign_faked_root(Inode *in)
221 {
222 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
223 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
224 last_used_faked_root = 0;
225 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
226 }
227 assert(it != free_faked_inos.end());
228 vinodeno_t inode_info = in->vino();
229 uint64_t inode_num = (uint64_t)inode_info.ino;
230 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
231 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
232 assert(it.get_start() + it.get_len() > last_used_faked_root);
233
234 in->faked_ino = last_used_faked_root;
235 free_faked_inos.erase(in->faked_ino);
236 faked_ino_map[in->faked_ino] = in->vino();
237 }
238
239 void Client::_release_faked_ino(Inode *in)
240 {
241 free_faked_inos.insert(in->faked_ino);
242 faked_ino_map.erase(in->faked_ino);
243 }
244
245 vinodeno_t Client::_map_faked_ino(ino_t ino)
246 {
247 vinodeno_t vino;
248 if (ino == 1)
249 vino = root->vino();
250 else if (faked_ino_map.count(ino))
251 vino = faked_ino_map[ino];
252 else
253 vino = vinodeno_t(0, CEPH_NOSNAP);
254 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
255 return vino;
256 }
257
258 vinodeno_t Client::map_faked_ino(ino_t ino)
259 {
260 std::lock_guard lock(client_lock);
261 return _map_faked_ino(ino);
262 }
263
264 // cons/des
265
// Construct a client bound to the given messenger / monitor client /
// objecter. Only state wiring happens here; thread-starting work
// (timer, finishers, object cacher) is deferred to _pre_init()/init().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  // fds below 10 are left out of the free set — presumably to keep clear
  // of the stdio range; confirm against allocation sites.
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  // ObjectCacher shares client_lock so inode destruction can call into
  // it while the lock is held (see ~Client()).
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback, // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
312
313
// Destructor: must not be entered with client_lock held (we take it
// ourselves below), and shutdown() should already have been called.
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}
324
325 void Client::tear_down_cache()
326 {
327 // fd's
328 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
329 it != fd_map.end();
330 ++it) {
331 Fh *fh = it->second;
332 ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
333 _release_fh(fh);
334 }
335 fd_map.clear();
336
337 while (!opened_dirs.empty()) {
338 dir_result_t *dirp = *opened_dirs.begin();
339 ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
340 _closedir(dirp);
341 }
342
343 // caps!
344 // *** FIXME ***
345
346 // empty lru
347 trim_cache();
348 ceph_assert(lru.lru_get_size() == 0);
349
350 // close root ino
351 ceph_assert(inode_map.size() <= 1 + root_parents.size());
352 if (root && inode_map.size() == 1 + root_parents.size()) {
353 delete root;
354 root = 0;
355 root_ancestor = 0;
356 while (!root_parents.empty())
357 root_parents.erase(root_parents.begin());
358 inode_map.clear();
359 _reset_faked_inos();
360 }
361
362 ceph_assert(inode_map.empty());
363 }
364
365 inodeno_t Client::get_root_ino()
366 {
367 std::lock_guard l(client_lock);
368 if (use_faked_inos())
369 return root->faked_ino;
370 else
371 return root->ino;
372 }
373
374 Inode *Client::get_root()
375 {
376 std::lock_guard l(client_lock);
377 root->ll_get();
378 return root;
379 }
380
381
382 // debug crapola
383
384 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
385 {
386 filepath path;
387 in->make_long_path(path);
388 ldout(cct, 1) << "dump_inode: "
389 << (disconnected ? "DISCONNECTED ":"")
390 << "inode " << in->ino
391 << " " << path
392 << " ref " << in->get_num_ref()
393 << *in << dendl;
394
395 if (f) {
396 f->open_object_section("inode");
397 f->dump_stream("path") << path;
398 if (disconnected)
399 f->dump_int("disconnected", 1);
400 in->dump(f);
401 f->close_section();
402 }
403
404 did.insert(in);
405 if (in->dir) {
406 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
407 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
408 it != in->dir->dentries.end();
409 ++it) {
410 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
411 if (f) {
412 f->open_object_section("dentry");
413 it->second->dump(f);
414 f->close_section();
415 }
416 if (it->second->inode)
417 dump_inode(f, it->second->inode.get(), did, false);
418 }
419 }
420 }
421
422 void Client::dump_cache(Formatter *f)
423 {
424 set<Inode*> did;
425
426 ldout(cct, 1) << __func__ << dendl;
427
428 if (f)
429 f->open_array_section("cache");
430
431 if (root)
432 dump_inode(f, root, did, true);
433
434 // make a second pass to catch anything disconnected
435 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
436 it != inode_map.end();
437 ++it) {
438 if (did.count(it->second))
439 continue;
440 dump_inode(f, it->second, did, true);
441 }
442
443 if (f)
444 f->close_section();
445 }
446
// Dump overall client status (session metadata, cache counters, map
// epochs) to the formatter. Caller must already hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
478
// Start the background machinery that must exist before any dispatch:
// the tick timer, the objecter finisher (and the Filer built on top of
// it), blacklist event tracking, and the object cacher threads.
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();

  objectcacher->start();
}
489
490 int Client::init()
491 {
492 _pre_init();
493 {
494 std::lock_guard l{client_lock};
495 ceph_assert(!initialized);
496 messenger->add_dispatcher_tail(this);
497 }
498 _finish_init();
499 return 0;
500 }
501
502 void Client::_finish_init()
503 {
504 {
505 std::lock_guard l{client_lock};
506 // logger
507 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
508 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
509 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
510 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
511 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
512 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
513 logger.reset(plb.create_perf_counters());
514 cct->get_perfcounters_collection()->add(logger.get());
515 }
516
517 cct->_conf.add_observer(this);
518
519 AdminSocket* admin_socket = cct->get_admin_socket();
520 int ret = admin_socket->register_command("mds_requests",
521 &m_command_hook,
522 "show in-progress mds requests");
523 if (ret < 0) {
524 lderr(cct) << "error registering admin socket command: "
525 << cpp_strerror(-ret) << dendl;
526 }
527 ret = admin_socket->register_command("mds_sessions "
528 "name=cap_dump,type=CephBool,req=false",
529 &m_command_hook,
530 "show mds session state");
531 if (ret < 0) {
532 lderr(cct) << "error registering admin socket command: "
533 << cpp_strerror(-ret) << dendl;
534 }
535 ret = admin_socket->register_command("dump_cache",
536 &m_command_hook,
537 "show in-memory metadata cache contents");
538 if (ret < 0) {
539 lderr(cct) << "error registering admin socket command: "
540 << cpp_strerror(-ret) << dendl;
541 }
542 ret = admin_socket->register_command("kick_stale_sessions",
543 &m_command_hook,
544 "kick sessions that were remote reset");
545 if (ret < 0) {
546 lderr(cct) << "error registering admin socket command: "
547 << cpp_strerror(-ret) << dendl;
548 }
549 ret = admin_socket->register_command("status",
550 &m_command_hook,
551 "show overall client status");
552 if (ret < 0) {
553 lderr(cct) << "error registering admin socket command: "
554 << cpp_strerror(-ret) << dendl;
555 }
556
557 std::lock_guard l{client_lock};
558 initialized = true;
559 }
560
// Orderly teardown, mirroring init()/_pre_init(): close MDS sessions,
// detach config/admin-socket hooks, drain and stop each callback
// finisher that was started, stop the object cacher and timer, drain
// the objecter finisher, and finally unregister the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher below is only running if its callback was registered;
  // wait_for_empty() drains queued work before stop().
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
    timer.shutdown();
  }
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
620
621
622 // ===================
623 // metadata cache stuff
624
// Trim the dentry LRU down to client_cache_size (all the way to zero
// while unmounting), optionally invalidating the kernel dcache, then
// drop the root inode if nothing references it anymore.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Iterate to a fixed point: stop when a pass no longer shrinks the LRU
  // (trim_dentry() can unlink additional entries as a side effect).
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  // Root (plus any root_parents) is the last thing in the inode map;
  // once it is unreferenced we can drop the whole map and reset faked inos.
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
658
659 void Client::trim_cache_for_reconnect(MetaSession *s)
660 {
661 mds_rank_t mds = s->mds_num;
662 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
663
664 int trimmed = 0;
665 list<Dentry*> skipped;
666 while (lru.lru_get_size() > 0) {
667 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
668 if (!dn)
669 break;
670
671 if ((dn->inode && dn->inode->caps.count(mds)) ||
672 dn->dir->parent_inode->caps.count(mds)) {
673 trim_dentry(dn);
674 trimmed++;
675 } else
676 skipped.push_back(dn);
677 }
678
679 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
680 lru.lru_insert_mid(*p);
681
682 ldout(cct, 20) << __func__ << " mds." << mds
683 << " trimmed " << trimmed << " dentries" << dendl;
684
685 if (s->caps.size() > 0)
686 _invalidate_kernel_dcache();
687 }
688
689 void Client::trim_dentry(Dentry *dn)
690 {
691 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
692 << " in dir "
693 << std::hex << dn->dir->parent_inode->ino << std::dec
694 << dendl;
695 if (dn->inode) {
696 Inode *diri = dn->dir->parent_inode;
697 clear_dir_complete_and_ordered(diri, true);
698 }
699 unlink(dn, false, false); // drop dir, drop dentry
700 }
701
702
// Apply an MDS-reported size / truncation state to the local inode.
// Size only moves forward unless the MDS's truncate_seq is strictly
// newer; cached (and inline) file data past the new size is invalidated.
// 'issued' is currently unread here — kept for signature symmetry with
// update_inode_file_time().
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // accept the size if the MDS truncation state is strictly newer, or
  // equally new but the file grew
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change even when the size itself did not
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
744
745 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
746 utime_t ctime, utime_t mtime, utime_t atime)
747 {
748 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
749 << " ctime " << ctime << " mtime " << mtime << dendl;
750
751 if (time_warp_seq > in->time_warp_seq)
752 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
753 << " is higher than local time_warp_seq "
754 << in->time_warp_seq << dendl;
755
756 int warn = false;
757 // be careful with size, mtime, atime
758 if (issued & (CEPH_CAP_FILE_EXCL|
759 CEPH_CAP_FILE_WR|
760 CEPH_CAP_FILE_BUFFER|
761 CEPH_CAP_AUTH_EXCL|
762 CEPH_CAP_XATTR_EXCL)) {
763 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
764 if (ctime > in->ctime)
765 in->ctime = ctime;
766 if (time_warp_seq > in->time_warp_seq) {
767 //the mds updated times, so take those!
768 in->mtime = mtime;
769 in->atime = atime;
770 in->time_warp_seq = time_warp_seq;
771 } else if (time_warp_seq == in->time_warp_seq) {
772 //take max times
773 if (mtime > in->mtime)
774 in->mtime = mtime;
775 if (atime > in->atime)
776 in->atime = atime;
777 } else if (issued & CEPH_CAP_FILE_EXCL) {
778 //ignore mds values as we have a higher seq
779 } else warn = true;
780 } else {
781 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
782 if (time_warp_seq >= in->time_warp_seq) {
783 in->ctime = ctime;
784 in->mtime = mtime;
785 in->atime = atime;
786 in->time_warp_seq = time_warp_seq;
787 } else warn = true;
788 }
789 if (warn) {
790 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
791 << time_warp_seq << " is lower than local time_warp_seq "
792 << in->time_warp_seq
793 << dendl;
794 }
795 }
796
797 void Client::_fragmap_remove_non_leaves(Inode *in)
798 {
799 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
800 if (!in->dirfragtree.is_leaf(p->first))
801 in->fragmap.erase(p++);
802 else
803 ++p;
804 }
805
806 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
807 {
808 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
809 if (p->second == mds)
810 in->fragmap.erase(p++);
811 else
812 ++p;
813 }
814
// Create or refresh a cached Inode from an MDS-supplied InodeStat, then
// apply the cap grant carried with it. Which fields may be overwritten
// is gated on (a) whether the stat is strictly newer than what we have
// and (b) which caps we already hold (EXCL caps mean our local copy may
// be dirtier than the MDS's view). Returns the cached inode.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // the first inode we ever see becomes the root
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // pre-mount path walking: chain ancestors above the current root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold or have dirtied — fields covered by an EXCL
  // cap we hold must not be clobbered by the MDS's (possibly stale) copy
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes don't get real caps; just record what was granted
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
970
971
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry of that name already exists with the same vino it is just
 * touched; if it points at a different inode it is unlinked first. When
 * (re)linking, the parent directory's I_COMPLETE/I_DIR_ORDERED state is
 * cleared, and the dentry's lease is refreshed from dlease.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale link: detach it so we can relink below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link below so 'in' can't be freed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1018
// Refresh a dentry's lease from an MDS LeaseStat. The lease TTL is
// computed from 'from' (when the request was sent), not from now, and
// an existing longer lease is never shortened.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    // only extend, never shorten
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  // remember the parent dir's shared gen so staleness can be detected later
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
}
1040
1041
/*
 * update MDS location cache for a single inode: record (or erase) the
 * authoritative MDS for one dirfrag, and refresh the replicated flag.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // negative auth: forget any stale mapping for this frag
    in->fragmap.erase(dst->frag);
  }
  // force the local fragtree to treat this frag as a leaf and prune any
  // fragmap entries that stop being leaves as a result
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
1062
1063 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1064 {
1065 if (complete)
1066 diri->dir_release_count++;
1067 else
1068 diri->dir_ordered_count++;
1069 if (diri->flags & I_COMPLETE) {
1070 if (complete) {
1071 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1072 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1073 } else {
1074 if (diri->flags & I_DIR_ORDERED) {
1075 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1076 diri->flags &= ~I_DIR_ORDERED;
1077 }
1078 }
1079 if (diri->dir)
1080 diri->dir->readdir_cache.clear();
1081 }
1082 }
1083
/*
 * insert results from readdir or lssnap into the metadata cache.
 *
 * Decodes the reply's extra buffer (dirstat, then numdn dentry
 * entries) into diri's Dir, creating/replacing dentries and inodes as
 * needed, and fills request->dirp's buffer and (when still valid) the
 * shared readdir cache.  Updates dirp's frag/offset bookkeeping so
 * the next readdir request continues where this one ended.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // New-style MDS encodes with all feature bits assumed; otherwise
  // fall back to the connection's negotiated feature set.
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real entry ("." and ".." occupy 0/1)
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may reply for a different (e.g. re-split) frag than the
    // one we asked about; adopt its frag
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // only prime the shared readdir cache when starting a fresh
    // listing from the very beginning of the directory
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // per-entry wire format: name, lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	// offsets restart within each hash bucket
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume on the next request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1245
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * The trace may describe the target inode, its parent dentry and
 * directory, or nothing at all (a "traceless" reply, e.g. after MDS
 * replay).  Updates/creates the corresponding cached Inode and Dentry
 * objects, processes any attached snap trace, and for readdir/lssnap
 * replies delegates to insert_readdir_results().
 *
 * Returns the target inode (also stored in request->target), or NULL
 * when there is no trace / the request already got an unsafe reply.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the earlier unsafe reply already carried the trace; the safe
    // reply must not carry another one
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot tell what the MDS did, so
    // conservatively invalidate cached state the op may have changed
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // see insert_readdir_results(): REPLY_ENCODING implies all features
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // decode order on the wire: [dir inode, dir stat, name, lease] if
  // is_dentry, then [target inode] if is_target
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs, the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1419
1420 // -------
1421
/*
 * Pick the MDS rank to send a request to.
 *
 * Preference order: an explicitly requested resend_mds; the MDS
 * owning the dir fragment hashed from the request's dentry name; the
 * auth cap (or any cap) on the relevant inode; finally a random
 * active MDS.  If the choice came from a dirfrag hash, *phash_diri is
 * set to the directory inode so the caller can drop a stale fragmap
 * entry if that MDS turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit target (e.g. after a forward) wins; consume it
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to locate the dir fragment
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to the first
    // non-snap ancestor and use its caps instead
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
             << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to caps: auth cap when the op needs the auth MDS,
    // otherwise any cap-issuing MDS
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1524
1525
1526 void Client::connect_mds_targets(mds_rank_t mds)
1527 {
1528 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1529 ceph_assert(mds_sessions.count(mds));
1530 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1531 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1532 q != info.export_targets.end();
1533 ++q) {
1534 if (mds_sessions.count(*q) == 0 &&
1535 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1536 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1537 << " export target mds." << *q << dendl;
1538 _open_mds_session(*q);
1539 }
1540 }
1541 }
1542
1543 void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
1544 {
1545 f->dump_int("id", get_nodeid().v);
1546 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1547 f->dump_object("inst", inst);
1548 f->dump_stream("inst_str") << inst;
1549 f->dump_stream("addr_str") << inst.addr;
1550 f->open_array_section("sessions");
1551 for (const auto &p : mds_sessions) {
1552 f->open_object_section("session");
1553 p.second.dump(f, cap_dump);
1554 f->close_section();
1555 }
1556 f->close_section();
1557 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1558 }
1559 void Client::dump_mds_requests(Formatter *f)
1560 {
1561 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1562 p != mds_requests.end();
1563 ++p) {
1564 f->open_object_section("request");
1565 p->second->dump(f);
1566 f->close_section();
1567 }
1568 }
1569
/*
 * Post-process a successful MDS reply for callers that want the
 * target inode (ptarget) and/or a created flag (pcreated).
 *
 * If the reply carried no trace (request->target unset), fall back to
 * a lookup/getattr to find the inode we just operated on; if the ino
 * found that way disagrees with the created ino reported by the MDS,
 * return -EINTR (a concurrent op raced us).  Returns r, possibly
 * replaced by the fallback lookup's error.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // normal case: the trace told us the target inode
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry either: re-stat the request's inode directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1654
1655
1656 /**
1657 * make a request
1658 *
1659 * Blocking helper to make an MDS request.
1660 *
1661 * If the ptarget flag is set, behavior changes slightly: the caller
1662 * expects to get a pointer to the inode we are creating or operating
1663 * on. As a result, we will follow up any traceless mutation reply
1664 * with a getattr or lookup to transparently handle a traceless reply
1665 * from the MDS (as when the MDS restarts and the client has to replay
1666 * a request).
1667 *
1668 * @param request the MetaRequest to execute
1669 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1670 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1671 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1672 * @param use_mds [optional] prefer a specific mds (-1 for default)
1673 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1674 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK ops can block indefinitely, so they are excluded from
  // the oldest-tid tracking used for MDS completed-request trimming
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait; loop
  // again on forward/kick until a reply arrives or we abort
  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists (cluster shrank)
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // adopt the already-held client_lock so the condvar can drop it
    // while waiting; release() afterwards keeps it held by us
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted before any reply: clean up our registration and bail
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1820
1821 void Client::unregister_request(MetaRequest *req)
1822 {
1823 mds_requests.erase(req->tid);
1824 if (req->tid == oldest_tid) {
1825 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1826 while (true) {
1827 if (p == mds_requests.end()) {
1828 oldest_tid = 0;
1829 break;
1830 }
1831 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1832 oldest_tid = p->first;
1833 break;
1834 }
1835 ++p;
1836 }
1837 }
1838 put_request(req);
1839 }
1840
1841 void Client::put_request(MetaRequest *request)
1842 {
1843 if (request->_put()) {
1844 int op = -1;
1845 if (request->success)
1846 op = request->get_op();
1847 InodeRef other_in;
1848 request->take_other_inode(&other_in);
1849 delete request;
1850
1851 if (other_in &&
1852 (op == CEPH_MDS_OP_RMDIR ||
1853 op == CEPH_MDS_OP_RENAME ||
1854 op == CEPH_MDS_OP_RMSNAP)) {
1855 _try_to_trim_inode(other_in.get(), false);
1856 }
1857 }
1858 }
1859
/*
 * Possibly append a cap release for `in` to req->cap_releases.
 *
 * Caps in `drop` (excluding anything dirty or in use) are revoked
 * locally and released to the MDS, unless the MDS also issued a cap
 * in `unless`.  A non-zero `force` appends a release record even when
 * nothing was dropped (used by encode_dentry_release).
 *
 * Returns non-zero iff a release record was appended.
 */
int Client::encode_inode_release(Inode *in, MetaRequest *req,
			 mds_rank_t mds, int drop,
			 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
	   << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
	   << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // never drop caps that are dirty or currently in use
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
	  !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
	in->requested_max_size = 0;
	ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // build the wire-format release record from the (updated) cap
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
	   << released << dendl;
  return released;
}
1905
1906 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1907 mds_rank_t mds, int drop, int unless)
1908 {
1909 ldout(cct, 20) << __func__ << " enter(dn:"
1910 << dn << ")" << dendl;
1911 int released = 0;
1912 if (dn->dir)
1913 released = encode_inode_release(dn->dir->parent_inode, req,
1914 mds, drop, unless, 1);
1915 if (released && dn->lease_mds == mds) {
1916 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1917 auto& rel = req->cap_releases.back();
1918 rel.item.dname_len = dn->name.length();
1919 rel.item.dname_seq = dn->lease_seq;
1920 rel.dname = dn->name;
1921 dn->lease_mds = -1;
1922 }
1923 ldout(cct, 25) << __func__ << " exit(dn:"
1924 << dn << ")" << dendl;
1925 }
1926
1927
1928 /*
1929 * This requires the MClientRequest *request member to be set.
1930 * It will error out horribly without one.
1931 * Additionally, if you set any *drop member, you'd better have
1932 * set the corresponding dentry!
1933 */
1934 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1935 {
1936 ldout(cct, 20) << __func__ << " enter (req: "
1937 << req << ", mds: " << mds << ")" << dendl;
1938 if (req->inode_drop && req->inode())
1939 encode_inode_release(req->inode(), req,
1940 mds, req->inode_drop,
1941 req->inode_unless);
1942
1943 if (req->old_inode_drop && req->old_inode())
1944 encode_inode_release(req->old_inode(), req,
1945 mds, req->old_inode_drop,
1946 req->old_inode_unless);
1947 if (req->other_inode_drop && req->other_inode())
1948 encode_inode_release(req->other_inode(), req,
1949 mds, req->other_inode_drop,
1950 req->other_inode_unless);
1951
1952 if (req->dentry_drop && req->dentry())
1953 encode_dentry_release(req->dentry(), req,
1954 mds, req->dentry_drop,
1955 req->dentry_unless);
1956
1957 if (req->old_dentry_drop && req->old_dentry())
1958 encode_dentry_release(req->old_dentry(), req,
1959 mds, req->old_dentry_drop,
1960 req->old_dentry_unless);
1961 ldout(cct, 25) << __func__ << " exit (req: "
1962 << req << ", mds " << mds <<dendl;
1963 }
1964
1965 bool Client::have_open_session(mds_rank_t mds)
1966 {
1967 const auto &it = mds_sessions.find(mds);
1968 return it != mds_sessions.end() &&
1969 (it->second.state == MetaSession::STATE_OPEN ||
1970 it->second.state == MetaSession::STATE_STALE);
1971 }
1972
1973 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1974 {
1975 const auto &it = mds_sessions.find(mds);
1976 if (it == mds_sessions.end() || it->second.con != con) {
1977 return NULL;
1978 } else {
1979 return &it->second;
1980 }
1981 }
1982
1983 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1984 {
1985 auto it = mds_sessions.find(mds);
1986 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
1987 }
1988
1989 /**
1990 * Populate a map of strings with client-identifying metadata,
1991 * such as the hostname. Call this once at initialization.
1992 */
1993 void Client::populate_metadata(const std::string &mount_root)
1994 {
1995 // Hostname
1996 struct utsname u;
1997 int r = uname(&u);
1998 if (r >= 0) {
1999 metadata["hostname"] = u.nodename;
2000 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2001 } else {
2002 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2003 }
2004
2005 metadata["pid"] = stringify(getpid());
2006
2007 // Ceph entity id (the '0' in "client.0")
2008 metadata["entity_id"] = cct->_conf->name.get_id();
2009
2010 // Our mount position
2011 if (!mount_root.empty()) {
2012 metadata["root"] = mount_root;
2013 }
2014
2015 // Ceph version
2016 metadata["ceph_version"] = pretty_version_to_str();
2017 metadata["ceph_sha1"] = git_version_to_str();
2018
2019 // Apply any metadata from the user's configured overrides
2020 std::vector<std::string> tokens;
2021 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2022 for (const auto &i : tokens) {
2023 auto eqpos = i.find("=");
2024 // Throw out anything that isn't of the form "<str>=<str>"
2025 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2026 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2027 continue;
2028 }
2029 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2030 }
2031 }
2032
2033 /**
2034 * Optionally add or override client metadata fields.
2035 */
2036 void Client::update_metadata(std::string const &k, std::string const &v)
2037 {
2038 std::lock_guard l(client_lock);
2039 ceph_assert(initialized);
2040
2041 auto it = metadata.find(k);
2042 if (it != metadata.end()) {
2043 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2044 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2045 }
2046
2047 metadata[k] = v;
2048 }
2049
/*
 * Create a session object for `mds` and send it a
 * CEPH_SESSION_REQUEST_OPEN carrying our metadata and supported
 * feature bits.  The caller must not already have a session to this
 * rank (asserted).  Returns the newly inserted session; it is usable
 * once the MDS replies with CEPH_SESSION_OPEN.
 */
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  // construct the MetaSession in place, bound to a fresh connection
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
2066
2067 void Client::_close_mds_session(MetaSession *s)
2068 {
2069 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2070 s->state = MetaSession::STATE_CLOSING;
2071 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2072 }
2073
/*
 * Finalize teardown of session `s`: set its terminal state, drop the
 * connection, wake waiters, release the session's caps (failing cap
 * waiters with `err`) and kick its pending requests.
 *
 * When `rejected` is set (and we weren't already CLOSING), the
 * session is marked REJECTED and retained in mds_sessions; a CLOSED
 * session is erased from the table.
 */
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anything blocked on this session before tearing down its state
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2090
// Handle a session-control message from an MDS: open/close acks, cap
// renewal, staleness, recall, flush, read-only and rejection notices.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  // ignore messages for sessions we don't (or no longer) have
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // refuse to use an MDS that lacks features we require
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	_close_mds_session(session);
	_closed_mds_session(session, -EPERM, true);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.notify_all();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only honor the ack if it matches our most recent renewal request
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      // surface the MDS-provided reason, if any, before dropping the session
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session, -EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2182
2183 bool Client::_any_stale_sessions() const
2184 {
2185 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2186
2187 for (const auto &p : mds_sessions) {
2188 if (p.second.state == MetaSession::STATE_STALE) {
2189 return true;
2190 }
2191 }
2192
2193 return false;
2194 }
2195
// Walk all sessions: drop REJECTED ones outright and force-close STALE ones.
// Iteration is done with a manual iterator because entries may be erased
// while walking (both directly and inside _closed_mds_session).
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    if (s.state == MetaSession::STATE_REJECTED) {
      // post-increment so `it` is advanced before the element is erased
      mds_sessions.erase(it++);
      continue;
    }
    ++it;
    // `it` has already moved past `s`, so it stays valid even if
    // _closed_mds_session erases `s` from mds_sessions
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2211
// (Re)build the wire message for a MetaRequest and send it on the given
// session.  `drop_cap_releases` is used during reconnect, before cap state
// has been re-established with the MDS.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already got an unsafe reply: this is a replay of an applied op
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr can reference pools/layouts; tell the MDS which osdmap we saw
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // stamp only on first send (mds == -1), not on resend/forward
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap mseq at send time so later replies can detect
  // cap migration
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2260
// Translate an in-memory MetaRequest into an MClientRequest wire message,
// filling in the filepath(s) from the inode/dentry if not already set.
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// dentry with no inode yet (e.g. create): path is parent dir + name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // bump the attempt counter every time the message is (re)built
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2298
2299
2300
2301 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2302 {
2303 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2304 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2305 if (!session) {
2306 return;
2307 }
2308 ceph_tid_t tid = fwd->get_tid();
2309
2310 if (mds_requests.count(tid) == 0) {
2311 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2312 return;
2313 }
2314
2315 MetaRequest *request = mds_requests[tid];
2316 ceph_assert(request);
2317
2318 // reset retry counter
2319 request->retry_attempt = 0;
2320
2321 // request not forwarded, or dest mds has no session.
2322 // resend.
2323 ldout(cct, 10) << __func__ << " tid " << tid
2324 << " fwd " << fwd->get_num_fwd()
2325 << " to mds." << fwd->get_dest_mds()
2326 << ", resending to " << fwd->get_dest_mds()
2327 << dendl;
2328
2329 request->mds = -1;
2330 request->item.remove_myself();
2331 request->num_fwd = fwd->get_num_fwd();
2332 request->resend_mds = fwd->get_dest_mds();
2333 request->caller_cond->notify_all();
2334 }
2335
2336 bool Client::is_dir_operation(MetaRequest *req)
2337 {
2338 int op = req->get_op();
2339 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2340 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2341 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2342 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2343 return true;
2344 return false;
2345 }
2346
// Handle an MDS reply (unsafe = applied in memory, safe = committed).
// Wakes the requesting thread on the first reply and waits for it to
// consume the trace before returning; cleans up on the safe reply.
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    std::map<mds_rank_t, Cap>::const_iterator it;
    // if we'd resend to the same mds we already asked (and hold a cap from
    // it), retrying is pointless: return ESTALE to the caller
    // NOTE(review): with short-circuit ||, the sent_on_mseq clause is only
    // evaluated when find() returned end(), so it->second dereferences the
    // end iterator on that path — confirm the intended condition here.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
	 request->sent_on_mseq == it->second.mseq)) {
      ldout(cct, 20) << "have to return ESTALE" << dendl;
    } else {
      request->caller_cond->notify_all();
      return;
    }
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    // (the caller clears request->dispatch_cond once it has taken the reply)
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    l.release();  // keep client_lock held; we only adopted it for the wait
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.notify_all();
}
2450
2451 void Client::_handle_full_flag(int64_t pool)
2452 {
2453 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2454 << "on " << pool << dendl;
2455 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2456 // to do this rather than blocking, because otherwise when we fill up we
2457 // potentially lock caps forever on files with dirty pages, and we need
2458 // to be able to release those caps to the MDS so that it can delete files
2459 // and free up space.
2460 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2461
2462 // For all inodes with layouts in this pool and a pending flush write op
2463 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2464 // from ObjectCacher so that it doesn't re-issue the write in response to
2465 // the ENOSPC error.
2466 // Fortunately since we're cancelling everything in a given pool, we don't
2467 // need to know which ops belong to which ObjectSet, we can just blow all
2468 // the un-flushed cached data away and mark any dirty inodes' async_err
2469 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2470 // affecting this pool, and all the objectsets we're purging were also
2471 // in this pool.
2472 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2473 i != inode_map.end(); ++i)
2474 {
2475 Inode *inode = i->second;
2476 if (inode->oset.dirty_or_tx
2477 && (pool == -1 || inode->layout.pool_id == pool)) {
2478 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2479 << " has dirty objects, purging and setting ENOSPC" << dendl;
2480 objectcacher->purge_set(&inode->oset);
2481 inode->set_async_err(-ENOSPC);
2482 }
2483 }
2484
2485 if (cancelled_epoch != (epoch_t)-1) {
2486 set_cap_epoch_barrier(cancelled_epoch);
2487 }
2488 }
2489
// Process a new OSDMap: detect (un)blacklisting of our own addrs, and
// react to cluster-wide or per-pool FULL flags.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blacklisted) {
    // check whether any of our own addresses appear in the new
    // blacklist events
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
        return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // cluster-wide full: cancel writes in every pool
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2573
2574
2575 // ------------------------
2576 // incoming messages
2577
2578
// Main messenger dispatch entry point.  Routes incoming messages to the
// appropriate handler under client_lock; returns false for message types
// we don't own so other dispatchers can claim them.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // command replies can come from non-MDS daemons too; only claim ours
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // opportunistically trim the cache; if it shrank, poke unmount() which
  // may be waiting for the cache to empty
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2660
2661 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2662 {
2663 fsmap.reset(new FSMap(m->get_fsmap()));
2664
2665 signal_cond_list(waiting_for_fsmap);
2666
2667 monclient->sub_got("fsmap", fsmap->get_epoch());
2668 }
2669
2670 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2671 {
2672 fsmap_user.reset(new FSMapUser);
2673 *fsmap_user = m->get_fsmap();
2674
2675 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2676 signal_cond_list(waiting_for_fsmap);
2677 }
2678
// Process a new MDSMap: cancel commands to vanished/laggy MDSs, and walk
// our sessions reacting to each MDS's address/state change (reconnect,
// close, kick requests, etc.).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  // only assigned when an MDS's address set changed; see below
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // keep the previous map around so we can compare per-MDS state
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase cancelled commands after the walk, to avoid invalidating the
  // command-table iteration above
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // advance before any path that may erase the current session
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // the rank moved to a new daemon instance
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2780
// Send our cap/snaprealm state to an MDS entering RECONNECT, so it can
// rebuild its view of our session after a failover.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // newer MDSs accept the reconnect payload split across several messages
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // flush the current message and start a new one if it grew too big
      if (allow_multi &&
	  m->get_approx_size() >=
	  static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(cap.issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	cap.gen = session->cap_gen;
	// our caps were invalidated while stale; keep only the pin
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2874
2875
2876 void Client::kick_requests(MetaSession *session)
2877 {
2878 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2879 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2880 p != mds_requests.end();
2881 ++p) {
2882 MetaRequest *req = p->second;
2883 if (req->got_unsafe)
2884 continue;
2885 if (req->aborted()) {
2886 if (req->caller_cond) {
2887 req->kick = true;
2888 req->caller_cond->notify_all();
2889 }
2890 continue;
2891 }
2892 if (req->retry_attempt > 0)
2893 continue; // new requests only
2894 if (req->mds == session->mds_num) {
2895 send_request(p->second, session);
2896 }
2897 }
2898 }
2899
2900 void Client::resend_unsafe_requests(MetaSession *session)
2901 {
2902 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2903 !iter.end();
2904 ++iter)
2905 send_request(*iter, session);
2906
2907 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2908 // process completed requests in clientreplay stage.
2909 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2910 p != mds_requests.end();
2911 ++p) {
2912 MetaRequest *req = p->second;
2913 if (req->got_unsafe)
2914 continue;
2915 if (req->aborted())
2916 continue;
2917 if (req->retry_attempt == 0)
2918 continue; // old requests only
2919 if (req->mds == session->mds_num)
2920 send_request(req, session, true);
2921 }
2922 }
2923
2924 void Client::wait_unsafe_requests()
2925 {
2926 list<MetaRequest*> last_unsafe_reqs;
2927 for (const auto &p : mds_sessions) {
2928 const MetaSession &s = p.second;
2929 if (!s.unsafe_requests.empty()) {
2930 MetaRequest *req = s.unsafe_requests.back();
2931 req->get();
2932 last_unsafe_reqs.push_back(req);
2933 }
2934 }
2935
2936 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2937 p != last_unsafe_reqs.end();
2938 ++p) {
2939 MetaRequest *req = *p;
2940 if (req->unsafe_item.is_on_list())
2941 wait_on_list(req->waitfor_safe);
2942 put_request(req);
2943 }
2944 }
2945
2946 void Client::kick_requests_closed(MetaSession *session)
2947 {
2948 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
2949 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2950 p != mds_requests.end(); ) {
2951 MetaRequest *req = p->second;
2952 ++p;
2953 if (req->mds == session->mds_num) {
2954 if (req->caller_cond) {
2955 req->kick = true;
2956 req->caller_cond->notify_all();
2957 }
2958 req->item.remove_myself();
2959 if (req->got_unsafe) {
2960 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
2961 req->unsafe_item.remove_myself();
2962 if (is_dir_operation(req)) {
2963 Inode *dir = req->inode();
2964 assert(dir);
2965 dir->set_async_err(-EIO);
2966 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
2967 << dir->ino << " " << req->get_tid() << dendl;
2968 req->unsafe_dir_item.remove_myself();
2969 }
2970 if (req->target) {
2971 InodeRef &in = req->target;
2972 in->set_async_err(-EIO);
2973 lderr(cct) << "kick_requests_closed drop req of inode : "
2974 << in->ino << " " << req->get_tid() << dendl;
2975 req->unsafe_target_item.remove_myself();
2976 }
2977 signal_cond_list(req->waitfor_safe);
2978 unregister_request(req);
2979 }
2980 }
2981 }
2982 ceph_assert(session->requests.empty());
2983 ceph_assert(session->unsafe_requests.empty());
2984 }
2985
2986
2987
2988
2989 /************
2990 * leases
2991 */
2992
2993 void Client::got_mds_push(MetaSession *s)
2994 {
2995 s->seq++;
2996 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2997 if (s->state == MetaSession::STATE_CLOSING) {
2998 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2999 }
3000 }
3001
3002 void Client::handle_lease(const MConstRef<MClientLease>& m)
3003 {
3004 ldout(cct, 10) << __func__ << " " << *m << dendl;
3005
3006 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3007
3008 mds_rank_t mds = mds_rank_t(m->get_source().num());
3009 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3010 if (!session) {
3011 return;
3012 }
3013
3014 got_mds_push(session);
3015
3016 ceph_seq_t seq = m->get_seq();
3017
3018 Inode *in;
3019 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3020 if (inode_map.count(vino) == 0) {
3021 ldout(cct, 10) << " don't have vino " << vino << dendl;
3022 goto revoke;
3023 }
3024 in = inode_map[vino];
3025
3026 if (m->get_mask() & CEPH_LEASE_VALID) {
3027 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3028 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3029 goto revoke;
3030 }
3031 Dentry *dn = in->dir->dentries[m->dname];
3032 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3033 dn->lease_mds = -1;
3034 }
3035
3036 revoke:
3037 {
3038 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3039 m->get_mask(), m->get_ino(),
3040 m->get_first(), m->get_last(), m->dname);
3041 m->get_connection()->send_message2(std::move(reply));
3042 }
3043 }
3044
// Drop n references on an inode; on the last reference, release caps,
// detach it from all client-side indexes, and delete it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);  // an inode with dirty/tx buffers must not reach refcount 0
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    // if this was the root, clear the cached root ancestry too
    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
3070
// Destroy an (empty) Dir object and drop the pins it held on its parent
// inode and that inode's dentry.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
3085
3086 /**
3087 * Don't call this with in==NULL, use get_or_create for that
3088 * leave dn set to default NULL unless you're trying to add
3089 * a new inode to a pre-created Dentry
3090 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  // Insert (or reuse) the dentry for 'name' in 'dir' and, when an inode
  // is supplied, attach it.  Returns the dentry in either case.
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created dentry: it must still be negative
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // the old parent's listing is no longer known complete/ordered
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3125
/**
 * Detach a dentry from its inode and (optionally) remove it from its
 * directory.
 *
 * @param dn the dentry to unlink
 * @param keepdir do not close the containing Dir even if it becomes empty
 * @param keepdentry keep the dentry around as a negative entry
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // hold a ref so the inode outlives the dentry teardown below
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // keep the (now negative) dentry, but its lease is no longer valid
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    // close out the Dir if it just became empty (unless asked to keep it)
    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3155
3156 /**
3157 * For asynchronous flushes, check for errors from the IO and
3158 * update the inode if necessary
3159 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pinned so the inode survives until the flush finishes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Runs with client_lock held.  On failure, record the error on the
  // inode so it can be surfaced to the application later.
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
                            << " 0x" << std::hex << inode->ino << std::dec
                            << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3177
3178
3179 /****
3180 * caps
3181 */
3182
3183 void Client::get_cap_ref(Inode *in, int cap)
3184 {
3185 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3186 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3187 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3188 in->get();
3189 }
3190 if ((cap & CEPH_CAP_FILE_CACHE) &&
3191 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3192 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3193 in->get();
3194 }
3195 in->get_cap_ref(cap);
3196 }
3197
/**
 * Drop references on the given cap bits for @in.
 *
 * When the last reference on a bit goes away this may: finish a
 * pending cap_snap (last WR/BUFFER ref), clear cap_snap dirty_data and
 * wake commit waiters (last BUFFER ref), trigger check_caps() for bits
 * that are no longer issued, and drop the inode pins that were taken
 * for the first FILE_BUFFER/FILE_CACHE references.
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we just released that the MDS no longer issues to us
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // all buffered data has been flushed; cap_snaps need not wait on it
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3231
/**
 * Wait for and take references on the caps needed for a file operation.
 *
 * Loops until @need caps are issued and not blocked (by max_size,
 * pending cap_snaps, or in-progress revocation of @want bits), then
 * takes a cap reference on @need and stores the obtained set
 * (need plus any available want bits) in @phave.
 *
 * @param fh open file handle the operation is performed through
 * @param need caps that must be held before returning 0
 * @param want additional caps to report in @phave if available
 * @param phave out: caps actually obtained
 * @param endoff for writes, the end offset (may request larger max_size);
 *               pass <= 0 when not applicable
 * @return 0 on success; -EBADF if the handle no longer wants/backs the
 *         needed caps, -EIO on file lock errors, -EROFS for writes on a
 *         read-only session, or an error from pool permission checks or
 *         cap renewal.
 */
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    // a stale fd (generation bumped) must not be written through
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // grow wanted_max_size (and ask the MDS) if this write nears the limit
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // writes must not race with a cap_snap that is still being written
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick a flush so the dirty snap data drains
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // success only if none of the want bits are mid-revocation
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // the MDS may have forgotten what we want; re-request if so
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3335
3336 int Client::get_caps_used(Inode *in)
3337 {
3338 unsigned used = in->caps_used();
3339 if (!(used & CEPH_CAP_FILE_CACHE) &&
3340 !objectcacher->set_is_empty(&in->oset))
3341 used |= CEPH_CAP_FILE_CACHE;
3342 return used;
3343 }
3344
3345 void Client::cap_delay_requeue(Inode *in)
3346 {
3347 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3348 in->hold_caps_until = ceph_clock_now();
3349 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3350 delayed_list.push_back(&in->delay_cap_item);
3351 }
3352
/**
 * Send a cap UPDATE message for @in to the MDS behind @session's @cap.
 *
 * Locally drops cap bits not covered by @retain (unless failure
 * injection is enabled), then sends an MClientCaps carrying the
 * current inode metadata, what we hold/want, and any dirty caps being
 * flushed (@flush, identified by @flush_tid).
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain bits that are mid-revocation
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // only ship xattrs when they are part of what is being flushed
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // tell the MDS a cap_snap flush is still outstanding, if any
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS tracks max_size for us
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3470
3471 static bool is_max_size_approaching(Inode *in)
3472 {
3473 /* mds will adjust max size according to the reported size */
3474 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3475 return false;
3476 if (in->size >= in->max_size)
3477 return true;
3478 /* half of previous max_size increment has been used */
3479 if (in->max_size > in->reported_size &&
3480 (in->size << 1) >= in->max_size + in->reported_size)
3481 return true;
3482 return false;
3483 }
3484
3485 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3486 {
3487 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3488 return used;
3489 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3490 return used;
3491
3492 if (issued & CEPH_CAP_FILE_LAZYIO) {
3493 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3494 used &= ~CEPH_CAP_FILE_CACHE;
3495 used |= CEPH_CAP_FILE_LAZYIO;
3496 }
3497 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3498 used &= ~CEPH_CAP_FILE_BUFFER;
3499 used |= CEPH_CAP_FILE_LAZYIO;
3500 }
3501 } else {
3502 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3503 used &= ~CEPH_CAP_FILE_CACHE;
3504 used |= CEPH_CAP_FILE_LAZYIO;
3505 }
3506 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3507 used &= ~CEPH_CAP_FILE_BUFFER;
3508 used |= CEPH_CAP_FILE_LAZYIO;
3509 }
3510 }
3511 return used;
3512 }
3513
3514 /**
3515 * check_caps
3516 *
3517 * Examine currently used and wanted versus held caps. Release, flush or ack
3518 * revoked caps to the MDS as appropriate.
3519 *
3520 * @param in the inode to check
3521 * @param flags flags to apply to cap check
3522 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // decide which caps we would like to keep hold of
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // try to satisfy a cache-cap revocation by dropping cached data now
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }


  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    // usage covered by the auth cap doesn't pin non-auth caps
    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // on unmount, release idle caps immediately
    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-send any interrupted flushes and kick off pending snap flushes
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3661
3662
3663 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3664 {
3665 int used = get_caps_used(in);
3666 int dirty = in->caps_dirty();
3667 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3668
3669 if (in->cap_snaps.size() &&
3670 in->cap_snaps.rbegin()->second.writing) {
3671 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3672 return;
3673 } else if (in->caps_dirty() ||
3674 (used & CEPH_CAP_FILE_WR) ||
3675 (dirty & CEPH_CAP_ANY_WR)) {
3676 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3677 ceph_assert(capsnapem.second); /* element inserted */
3678 CapSnap &capsnap = capsnapem.first->second;
3679 capsnap.context = old_snapc;
3680 capsnap.issued = in->caps_issued();
3681 capsnap.dirty = in->caps_dirty();
3682
3683 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3684
3685 capsnap.uid = in->uid;
3686 capsnap.gid = in->gid;
3687 capsnap.mode = in->mode;
3688 capsnap.btime = in->btime;
3689 capsnap.xattrs = in->xattrs;
3690 capsnap.xattr_version = in->xattr_version;
3691 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3692 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3693
3694 if (used & CEPH_CAP_FILE_WR) {
3695 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3696 capsnap.writing = 1;
3697 } else {
3698 finish_cap_snap(in, capsnap, used);
3699 }
3700 } else {
3701 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3702 }
3703 }
3704
/**
 * Finalize a CapSnap by capturing the inode's current size/times and
 * dirty caps, then either flush it or, if buffered data is still
 * outstanding, leave it pending until the buffers drain.
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data still outstanding; put_cap_ref() retries when the
    // last FILE_BUFFER ref is dropped
    capsnap.writing = 1;
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3736
3737 void Client::send_flush_snap(Inode *in, MetaSession *session,
3738 snapid_t follows, CapSnap& capsnap)
3739 {
3740 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3741 in->ino, in->snaprealm->ino, 0,
3742 in->auth_cap->mseq, cap_epoch_barrier);
3743 m->caller_uid = capsnap.cap_dirtier_uid;
3744 m->caller_gid = capsnap.cap_dirtier_gid;
3745
3746 m->set_client_tid(capsnap.flush_tid);
3747 m->head.snap_follows = follows;
3748
3749 m->head.caps = capsnap.issued;
3750 m->head.dirty = capsnap.dirty;
3751
3752 m->head.uid = capsnap.uid;
3753 m->head.gid = capsnap.gid;
3754 m->head.mode = capsnap.mode;
3755 m->btime = capsnap.btime;
3756
3757 m->size = capsnap.size;
3758
3759 m->head.xattr_version = capsnap.xattr_version;
3760 encode(capsnap.xattrs, m->xattrbl);
3761
3762 m->ctime = capsnap.ctime;
3763 m->btime = capsnap.btime;
3764 m->mtime = capsnap.mtime;
3765 m->atime = capsnap.atime;
3766 m->time_warp_seq = capsnap.time_warp_seq;
3767 m->change_attr = capsnap.change_attr;
3768
3769 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3770 m->inline_version = in->inline_version;
3771 m->inline_data = in->inline_data;
3772 }
3773
3774 ceph_assert(!session->flushing_caps_tids.empty());
3775 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3776
3777 session->con->send_message2(std::move(m));
3778 }
3779
/**
 * Flush every not-yet-flushed CapSnap of @in to its auth MDS.
 *
 * Stops at the first cap_snap that is still writing or has dirty
 * buffered data; later snaps must flush in order after it.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // this snap isn't ready yet; preserve flush ordering
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush with the session so acks can be matched up
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
3814
/**
 * Block the calling thread (which holds client_lock) until
 * signal_cond_list() wakes it, registering a private condvar on @ls.
 *
 * client_lock is adopted for the wait and released back to the caller
 * afterwards (via l.release()), so lock ownership is unchanged on
 * return.  Spurious wakeups are possible; callers re-check and loop.
 */
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}
3824
3825 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3826 {
3827 for (auto cond : ls) {
3828 cond->notify_all();
3829 }
3830 }
3831
/**
 * Block (holding client_lock) until a queued C_Cond context on @ls is
 * completed by signal_context_list().
 *
 * As in wait_on_list(), client_lock is adopted for the wait and
 * released back to the caller, so lock ownership is unchanged.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value not inspected here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}
3842
3843 void Client::signal_context_list(list<Context*>& ls)
3844 {
3845 while (!ls.empty()) {
3846 ls.front()->complete(0);
3847 ls.pop_front();
3848 }
3849 }
3850
/**
 * Wake cap waiters on every inode that has a cap in session @s.
 *
 * On @reconnect, max_size request state is reset so it will be asked
 * for again.  Otherwise, caps whose gen predates the session's cap_gen
 * were not re-issued by the MDS: they are downgraded to PIN and the
 * inode is flagged I_CAP_DROPPED if the MDS no longer knows our wants.
 */
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
3870
3871
3872 // flush dirty data (from objectcache)
3873
// Deferred invocation of the fuse/libcephfs cache-invalidation callback
// for one inode range.  The vinodeno is captured at construction time so
// no Inode reference needs to be held while the callback is queued.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // report faked inos to the callback when they are in use
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
3893
/**
 * Deliver a cache-invalidation callback for @ino's range @off~@len.
 * Runs on the async invalidator thread, without client_lock held;
 * skipped once unmount has begun.
 */
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
3901
// Queue an asynchronous invalidation of @in's range @off~@len, if an
// invalidation callback is registered at all.
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
3908
/**
 * Drop all cached data for @in: release clean data from the object
 * cacher (logging if anything could not be released) and schedule the
 * kernel/application invalidation callback for the whole inode.
 */
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  // off=0,len=0 means "the whole inode" to the callback
  _schedule_invalidate_callback(in, 0, 0);
}
3922
/**
 * Drop cached data for the byte range @off~@len of @in, discarding the
 * affected extents from the object cacher and scheduling the
 * invalidation callback for the same range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto the object extents it covers
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
3936
3937 bool Client::_release(Inode *in)
3938 {
3939 ldout(cct, 20) << "_release " << *in << dendl;
3940 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3941 _invalidate_inode_cache(in);
3942 return true;
3943 }
3944 return false;
3945 }
3946
/**
 * Start flushing @in's dirty buffered data to the OSDs.
 *
 * @param onfinish completion fired when the flush is done (or
 *        immediately with 0 if nothing is dirty, or with -ENOSPC if
 *        the pool is full and the dirty data had to be purged)
 * @return true if already complete (onfinish was consumed inline),
 *         false if the flush is in progress and onfinish fires later.
 */
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // writes cannot proceed against a full pool; drop the dirty data
    // and report ENOSPC rather than blocking forever
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
3968
/**
 * Synchronously flush the byte range @offset~@size of @in.
 *
 * Called with client_lock held; the lock is dropped while waiting for
 * the object cacher to finish and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
3987
// ObjectCacher callback: an inode's object set finished flushing.
// Forwards to _flushed() to drop the cap refs held for the flush.
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(ceph_mutex_is_locked(client_lock));   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
3996
// Buffered data for @in has been flushed; release the FILE_CACHE and
// FILE_BUFFER cap references that were pinning it.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4003
4004
4005
4006 // checks common to add_update_cap, handle_cap_grant
// checks common to add_update_cap, handle_cap_grant
// Bump the cache/shared generations when Fc or Fs is newly issued, and
// invalidate directory completeness when Fs changes.
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  // newly issued Fc: cached data from before this issue is stale
  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) !=
      (had & CEPH_CAP_FILE_SHARED)) {
    if (issued & CEPH_CAP_FILE_SHARED)
      in->shared_gen++;
    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
4023
/**
 * Add a new cap for @in from @mds_session, or update an existing one.
 *
 * Handles snaprealm open/move for the first cap or an auth-realm
 * change, cap import/export races (stale seq), auth-cap migration
 * (including moving flushing state to the new auth session), and wakes
 * cap waiters when new bits are granted.
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: attach it to its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // auth cap may move the inode to a different realm
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // updating an existing cap
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // possibly migrate the auth cap to this MDS
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer migration sequence replaces wanted; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // newly granted bits may unblock waiters
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4120
// Tear down a single cap.  Optionally queues a cap-release message for the
// MDS, detaches auth-cap bookkeeping, erases the cap from the inode's cap
// map (which destroys the Cap object), and closes the inode's snaprealm
// when its last cap goes away.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // the auth cap owned the flushing-list membership
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from the map destroys the Cap; null the pointer so it cannot
  // be used after free
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4156
4157 void Client::remove_all_caps(Inode *in)
4158 {
4159 while (!in->caps.empty())
4160 remove_cap(&in->caps.begin()->second, true);
4161 }
4162
// Drop every cap held via session 's', e.g. when the session is being torn
// down or the client was blacklisted.  'err' indicates why, and controls
// whether cached file data is purged (blacklisted) or merely released.
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    // hold a ref so the inode outlives remove_cap() below
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      // int-to-bool: true if any dirty or flushing bits are set
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // dirty/flushing state is being thrown away: reset the accounting
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // NOTE(review): drops a ref presumably taken when the caps became
      // dirty/flushing — the paired get is not visible in this chunk
      put_inode(in.get());
    }
    // if we lost FILE_CACHE/FILE_BUFFER, the page cache contents are stale
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -EBLACKLISTED) {
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        // blacklisted: dirty data can never be written back, discard it
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4212
4213 int Client::_do_remount(bool retry_on_error)
4214 {
4215 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4216
4217 errno = 0;
4218 int r = remount_cb(callback_handle);
4219 if (r == 0) {
4220 retries_on_invalidate = 0;
4221 } else {
4222 int e = errno;
4223 client_t whoami = get_nodeid();
4224 if (r == -1) {
4225 lderr(cct) <<
4226 "failed to remount (to trim kernel dentries): "
4227 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4228 } else {
4229 lderr(cct) <<
4230 "failed to remount (to trim kernel dentries): "
4231 "return code = " << r << dendl;
4232 }
4233 bool should_abort =
4234 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4235 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4236 !(retry_on_error && (++retries_on_invalidate < max_retries));
4237 if (should_abort && !unmounting) {
4238 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4239 ceph_abort();
4240 }
4241 }
4242 return r;
4243 }
4244
4245 class C_Client_Remount : public Context {
4246 private:
4247 Client *client;
4248 public:
4249 explicit C_Client_Remount(Client *c) : client(c) {}
4250 void finish(int r) override {
4251 ceph_assert(r == 0);
4252 client->_do_remount(true);
4253 }
4254 };
4255
4256 void Client::_invalidate_kernel_dcache()
4257 {
4258 if (unmounting)
4259 return;
4260 if (can_invalidate_dentries) {
4261 if (dentry_invalidate_cb && root->dir) {
4262 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4263 p != root->dir->dentries.end();
4264 ++p) {
4265 if (p->second->inode)
4266 _schedule_invalidate_dentry_callback(p->second, false);
4267 }
4268 }
4269 } else if (remount_cb) {
4270 // Hacky:
4271 // when remounting a file system, linux kernel trims all unused dentries in the fs
4272 remount_finisher.queue(new C_Client_Remount(this));
4273 }
4274 }
4275
// Drop expireable negative (inode-less) dentries under 'in', but only when
// the directory consists entirely of null dentries; recurses into an open
// snapdir as well.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink() can erase the current map entry
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4300
4301 class C_Client_CacheRelease : public Context {
4302 private:
4303 Client *client;
4304 vinodeno_t ino;
4305 public:
4306 C_Client_CacheRelease(Client *c, Inode *in) :
4307 client(c) {
4308 if (client->use_faked_inos())
4309 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4310 else
4311 ino = in->vino();
4312 }
4313 void finish(int r) override {
4314 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4315 client->_async_inode_release(ino);
4316 }
4317 };
4318
4319 void Client::_async_inode_release(vinodeno_t ino)
4320 {
4321 if (unmounting)
4322 return;
4323 ldout(cct, 10) << __func__ << " " << ino << dendl;
4324 ino_release_cb(callback_handle, ino);
4325 }
4326
4327 void Client::_schedule_ino_release_callback(Inode *in) {
4328
4329 if (ino_release_cb)
4330 // we queue the invalidate, which calls the callback and decrements the ref
4331 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4332 }
4333
// Try to reduce the number of caps held on session 's' to 'max' (requested
// by the MDS).  Disposable non-auth duplicate caps are released outright;
// for other caps we queue expireable dentries for trimming so unused inodes
// (and thus their caps) can be dropped afterwards.  If we still exceed the
// target, fall back to invalidating the kernel dcache.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // keep the inode alive across remove_cap()
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        // advance before the dentry may be queued for removal
        ++q;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      // every dentry was expireable, so the inode itself should go away
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  // now actually trim the dentries collected above
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4404
4405 void Client::force_session_readonly(MetaSession *s)
4406 {
4407 s->readonly = true;
4408 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4409 auto &in = (*p)->inode;
4410 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4411 signal_cond_list(in.waitfor_caps);
4412 }
4413 }
4414
// Transition the inode's dirty caps into the "flushing" state: assign a new
// flush tid, record it on both the inode and the auth session, and clear the
// dirty bits.  Returns the cap bits being flushed; the tid is stored via
// *ptid.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // bits are now tracked as flushing rather than dirty
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4442
4443 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4444 {
4445 for (auto &p : in->cap_snaps) {
4446 CapSnap &capsnap = p.second;
4447 if (capsnap.flush_tid > 0) {
4448 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4449 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4450 }
4451 }
4452 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4453 it != in->flushing_cap_tids.end();
4454 ++it) {
4455 old_s->flushing_caps_tids.erase(it->first);
4456 new_s->flushing_caps_tids.insert(it->first);
4457 }
4458 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4459 }
4460
4461 /*
4462 * Flush all caps back to the MDS. Because the callers generally wait on the
4463 * result of this function (syncfs and umount cases), we set
4464 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4465 */
4466 void Client::flush_caps_sync()
4467 {
4468 ldout(cct, 10) << __func__ << dendl;
4469 xlist<Inode*>::iterator p = delayed_list.begin();
4470 while (!p.end()) {
4471 unsigned flags = CHECK_CAPS_NODELAY;
4472 Inode *in = *p;
4473
4474 ++p;
4475 delayed_list.pop_front();
4476 if (p.end() && dirty_list.empty())
4477 flags |= CHECK_CAPS_SYNCHRONOUS;
4478 check_caps(in, flags);
4479 }
4480
4481 // other caps, too
4482 p = dirty_list.begin();
4483 while (!p.end()) {
4484 unsigned flags = CHECK_CAPS_NODELAY;
4485 Inode *in = *p;
4486
4487 ++p;
4488 if (p.end())
4489 flags |= CHECK_CAPS_SYNCHRONOUS;
4490 check_caps(in, flags);
4491 }
4492 }
4493
4494 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4495 {
4496 while (in->flushing_caps) {
4497 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4498 ceph_assert(it != in->flushing_cap_tids.end());
4499 if (it->first > want)
4500 break;
4501 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4502 << ccap_string(it->second) << " want " << want
4503 << " last " << it->first << dendl;
4504 wait_on_list(in->waitfor_caps);
4505 }
4506 }
4507
// Block until every session has acked all cap flushes with tid <= want.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // client_lock is already held: adopt it so the condvar can drop it
      // while sleeping, then release ownership back without unlocking.
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      // session state may have changed while we slept; rescan from the top
      goto retry;
    }
  }
}
4528
// Re-send all pending cap flushes (and cap-snap flushes) for 'in' to its
// auth session, e.g. after reconnect.  Entries in flushing_cap_tids with a
// zero cap mask represent snap flushes.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the tid of the newest pending snap flush, so regular flushes sent
  // before it carry FLAG_PENDING_CAPSNAP
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // zero mask: snap flush; cap_snaps entries and zero-mask tids are
      // expected to line up in order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4562
4563 void Client::kick_flushing_caps(MetaSession *session)
4564 {
4565 mds_rank_t mds = session->mds_num;
4566 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4567
4568 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4569 Inode *in = *p;
4570 if (in->flags & I_KICK_FLUSH) {
4571 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4572 kick_flushing_caps(in, session);
4573 }
4574 }
4575 }
4576
// During reconnect, immediately re-send cap flushes whose caps the MDS had
// revoked; flushes whose caps are still fully issued are deferred (marked
// I_KICK_FLUSH) to the normal kick after reconnect completes.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4604
4605 void SnapRealm::build_snap_context()
4606 {
4607 set<snapid_t> snaps;
4608 snapid_t max_seq = seq;
4609
4610 // start with prior_parents?
4611 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4612 snaps.insert(prior_parent_snaps[i]);
4613
4614 // current parent's snaps
4615 if (pparent) {
4616 const SnapContext& psnapc = pparent->get_snap_context();
4617 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4618 if (psnapc.snaps[i] >= parent_since)
4619 snaps.insert(psnapc.snaps[i]);
4620 if (psnapc.seq > max_seq)
4621 max_seq = psnapc.seq;
4622 }
4623
4624 // my snaps
4625 for (unsigned i=0; i<my_snaps.size(); i++)
4626 snaps.insert(my_snaps[i]);
4627
4628 // ok!
4629 cached_snap_context.seq = max_seq;
4630 cached_snap_context.snaps.resize(0);
4631 cached_snap_context.snaps.reserve(snaps.size());
4632 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4633 cached_snap_context.snaps.push_back(*p);
4634 }
4635
4636 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4637 {
4638 list<SnapRealm*> q;
4639 q.push_back(realm);
4640
4641 while (!q.empty()) {
4642 realm = q.front();
4643 q.pop_front();
4644
4645 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4646 realm->invalidate_cache();
4647
4648 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4649 p != realm->pchildren.end();
4650 ++p)
4651 q.push_back(*p);
4652 }
4653 }
4654
4655 SnapRealm *Client::get_snap_realm(inodeno_t r)
4656 {
4657 SnapRealm *realm = snap_realms[r];
4658 if (!realm)
4659 snap_realms[r] = realm = new SnapRealm(r);
4660 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4661 realm->nref++;
4662 return realm;
4663 }
4664
4665 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4666 {
4667 if (snap_realms.count(r) == 0) {
4668 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4669 return NULL;
4670 }
4671 SnapRealm *realm = snap_realms[r];
4672 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4673 realm->nref++;
4674 return realm;
4675 }
4676
// Drop one reference on 'realm'.  On the last reference, unregister it from
// snap_realms, recursively drop the reference it held on its parent, and
// delete it.  Precondition: realm must be non-null (dereferenced for the
// log line below).
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4690
4691 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4692 {
4693 if (realm->parent != parent) {
4694 ldout(cct, 10) << __func__ << " " << *realm
4695 << " " << realm->parent << " -> " << parent << dendl;
4696 realm->parent = parent;
4697 if (realm->pparent) {
4698 realm->pparent->pchildren.erase(realm);
4699 put_snap_realm(realm->pparent);
4700 }
4701 realm->pparent = get_snap_realm(parent);
4702 realm->pparent->pchildren.insert(realm);
4703 return true;
4704 }
4705 return false;
4706 }
4707
4708 static bool has_new_snaps(const SnapContext& old_snapc,
4709 const SnapContext& new_snapc)
4710 {
4711 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4712 }
4713
4714
4715 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4716 {
4717 SnapRealm *first_realm = NULL;
4718 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4719
4720 map<SnapRealm*, SnapContext> dirty_realms;
4721
4722 auto p = bl.cbegin();
4723 while (!p.end()) {
4724 SnapRealmInfo info;
4725 decode(info, p);
4726 SnapRealm *realm = get_snap_realm(info.ino());
4727
4728 bool invalidate = false;
4729
4730 if (info.seq() > realm->seq) {
4731 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4732 << dendl;
4733
4734 if (flush) {
4735 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4736 // flush me + children
4737 list<SnapRealm*> q;
4738 q.push_back(realm);
4739 while (!q.empty()) {
4740 SnapRealm *realm = q.front();
4741 q.pop_front();
4742
4743 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4744 p != realm->pchildren.end();
4745 ++p)
4746 q.push_back(*p);
4747
4748 if (dirty_realms.count(realm) == 0) {
4749 realm->nref++;
4750 dirty_realms[realm] = realm->get_snap_context();
4751 }
4752 }
4753 }
4754
4755 // update
4756 realm->seq = info.seq();
4757 realm->created = info.created();
4758 realm->parent_since = info.parent_since();
4759 realm->prior_parent_snaps = info.prior_parent_snaps;
4760 realm->my_snaps = info.my_snaps;
4761 invalidate = true;
4762 }
4763
4764 // _always_ verify parent
4765 if (adjust_realm_parent(realm, info.parent()))
4766 invalidate = true;
4767
4768 if (invalidate) {
4769 invalidate_snaprealm_and_children(realm);
4770 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4771 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4772 } else {
4773 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4774 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4775 }
4776
4777 if (!first_realm)
4778 first_realm = realm;
4779 else
4780 put_snap_realm(realm);
4781 }
4782
4783 for (auto &[realm, snapc] : dirty_realms) {
4784 // if there are new snaps ?
4785 if (has_new_snaps(snapc, realm->get_snap_context())) {
4786 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4787 for (auto&& in : realm->inodes_with_caps) {
4788 queue_cap_snap(in, snapc);
4789 }
4790 } else {
4791 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4792 }
4793 put_snap_realm(realm);
4794 }
4795
4796 if (realm_ret)
4797 *realm_ret = first_realm;
4798 else
4799 put_snap_realm(first_realm);
4800 }
4801
// Handle an MClientSnap message.  For a SPLIT op, detach the listed inodes
// (and re-parent the listed child realms) from their old realm into the
// newly split realm, apply the snap trace, then re-attach the moved inodes
// and queue cap snaps for any that gained new snaps.
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  // inodes moving to the split realm, with the snap context they had before
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// an inode already in a newer realm stays where it is
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
		   << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // don't flush caps against old snap info when the op is DESTROY
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // re-attach the moved inodes to the (now updated) split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
4872
4873 void Client::handle_quota(const MConstRef<MClientQuota>& m)
4874 {
4875 mds_rank_t mds = mds_rank_t(m->get_source().num());
4876 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4877 if (!session) {
4878 return;
4879 }
4880
4881 got_mds_push(session);
4882
4883 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
4884
4885 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4886 if (inode_map.count(vino)) {
4887 Inode *in = NULL;
4888 in = inode_map[vino];
4889
4890 if (in) {
4891 in->quota = m->quota;
4892 in->rstat = m->rstat;
4893 }
4894 }
4895 }
4896
4897 void Client::handle_caps(const MConstRef<MClientCaps>& m)
4898 {
4899 mds_rank_t mds = mds_rank_t(m->get_source().num());
4900 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4901 if (!session) {
4902 return;
4903 }
4904
4905 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4906 // Pause RADOS operations until we see the required epoch
4907 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4908 }
4909
4910 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4911 // Record the barrier so that we will transmit it to MDS when releasing
4912 set_cap_epoch_barrier(m->osd_epoch_barrier);
4913 }
4914
4915 got_mds_push(session);
4916
4917 Inode *in;
4918 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4919 if (auto it = inode_map.find(vino); it != inode_map.end()) {
4920 in = it->second;
4921 } else {
4922 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4923 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4924 session->enqueue_cap_release(
4925 m->get_ino(),
4926 m->get_cap_id(),
4927 m->get_seq(),
4928 m->get_mseq(),
4929 cap_epoch_barrier);
4930 } else {
4931 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
4932 }
4933
4934 // in case the mds is waiting on e.g. a revocation
4935 flush_cap_releases();
4936 return;
4937 }
4938
4939 switch (m->get_op()) {
4940 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
4941 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
4942 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
4943 }
4944
4945 if (auto it = in->caps.find(mds); it != in->caps.end()) {
4946 Cap &cap = in->caps.at(mds);
4947
4948 switch (m->get_op()) {
4949 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4950 case CEPH_CAP_OP_IMPORT:
4951 case CEPH_CAP_OP_REVOKE:
4952 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
4953 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
4954 }
4955 } else {
4956 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
4957 return;
4958 }
4959 }
4960
// Handle CEPH_CAP_OP_IMPORT: this MDS is taking over (or newly granting)
// our cap — typically the receiving half of a cap migration started by a
// peer MDS's EXPORT.  Install/refresh the cap here, remove the old peer
// cap, and re-kick any flushes if we became the auth cap holder.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // remember the exporting peer's cap (if any) so it can be removed below
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
		 issued, wanted, m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // the new auth MDS doesn't know about our old max_size request
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
	in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5003
// Handle CEPH_CAP_OP_EXPORT: the sending MDS is migrating our cap to a peer
// MDS (m->peer set) or simply dropping it (no peer).  Fold the exported
// state into the peer's cap (creating one if needed), transfer auth-cap and
// flushing bookkeeping, then remove the local cap.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	// cap is migrating to peer_mds
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  // only merge if the peer's cap is older than the migration state
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  // no cap on the peer yet; create one from the exported state
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	// no peer: the cap is simply being dropped
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5048
5049 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5050 {
5051 mds_rank_t mds = session->mds_num;
5052 ceph_assert(in->caps.count(mds));
5053
5054 ldout(cct, 10) << __func__ << " on ino " << *in
5055 << " size " << in->size << " -> " << m->get_size()
5056 << dendl;
5057
5058 int issued;
5059 in->caps_issued(&issued);
5060 issued |= in->caps_dirty();
5061 update_inode_file_size(in, issued, m->get_size(),
5062 m->get_truncate_seq(), m->get_truncate_size());
5063 }
5064
// Handle CEPH_CAP_OP_FLUSH_ACK: the MDS has persisted the metadata we
// flushed under tid 'flush_ack_tid'.  Retire that tid (and any older
// regular-flush tids it implicitly covers), clear the acked bits from
// flushing_caps, and wake waiters.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  // NOTE(review): begin() is dereferenced without checking for an empty
  // flushing_cap_tids map — presumably an ack only arrives while a flush
  // is outstanding; verify against the MDS protocol.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
                   << " got unexpected flush ack tid " << flush_ack_tid
                   << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (!it->second) {
      // cap snap
      ++it;
      continue;
    }
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // this flush (and anything older) is covered by the ack
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a newer flush still has these bits in flight; they aren't clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if nothing older remains pending on this session
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.notify_all();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->flushing_cap_tids.empty())
	  in->flushing_cap_item.remove_myself();
      }
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
5132
5133
// Handle FLUSHSNAP_ACK from the MDS: the snapped cap state for snapid
// 'follows' has been persisted, so drop the matching CapSnap record,
// retire its flush tid, and wake flush/sync waiters.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      // ack for an older resend of this capsnap; ignore, a newer ack will come
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // keep the inode alive while we tear down its capsnap state below
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      // no dirty caps and no tids in flight -> off the session flushing list
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
	in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // if this was the session's oldest outstanding flush, syncers may go
      if (session->flushing_caps_tids.empty() ||
	  *session->flushing_caps_tids.begin() > flush_ack_tid)
	sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5166
// Context queued on the async_dentry_invalidator thread to tell the kernel
// (via dentry_invalidate_cb) that a cached dentry is stale. The parent dir
// vino, target vino (only when 'del') and dentry name are captured at
// construction time, since the Dentry may be freed before finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // dentry's inode; zeroed unless 'del' was set
  string name;        // dentry name within dirino
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5194
5195 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5196 {
5197 if (unmounting)
5198 return;
5199 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5200 << " in dir " << dirino << dendl;
5201 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5202 }
5203
5204 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5205 {
5206 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5207 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5208 }
5209
// Try to shed cached references to 'in' so it can be trimmed: expire child
// dentries (recursing into snapped subtrees), close an emptied Dir, trim an
// open .snap dir, and finally unlink parent dentries, optionally scheduling
// kernel dcache invalidation for them (sched_inval).
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink may invalidate the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one reference on 'in'
    }
  }

  // an open .snap dir also pins the inode; trim it as well
  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink removes dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5256
// Handle a cap GRANT/REVOKE (also reached for IMPORT) from the MDS: refresh
// cached inode metadata covered by the newly shared caps, then reconcile our
// issued/implemented cap bits with what the MDS now grants. Revocations may
// trigger buffer flushes or cache releases before we can ack via check_caps.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  // stale: the session lost and regained connectivity since this cap's gen
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only accept MDS-provided fields we do not hold exclusive caps for
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size: only the auth MDS manages how far we may write
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5412
5413 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5414 {
5415 if (perms.uid() == 0)
5416 return 0;
5417
5418 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5419 int ret = _posix_acl_permission(in, perms, want);
5420 if (ret != -EAGAIN)
5421 return ret;
5422 }
5423
5424 // check permissions before doing anything else
5425 if (!in->check_mode(perms, want))
5426 return -EACCES;
5427 return 0;
5428 }
5429
5430 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5431 const UserPerm& perms)
5432 {
5433 int r = _getattr_for_perm(in, perms);
5434 if (r < 0)
5435 goto out;
5436
5437 r = 0;
5438 if (strncmp(name, "system.", 7) == 0) {
5439 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5440 r = -EPERM;
5441 } else {
5442 r = inode_permission(in, perms, want);
5443 }
5444 out:
5445 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5446 return r;
5447 }
5448
5449 ostream& operator<<(ostream &out, const UserPerm& perm) {
5450 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5451 return out;
5452 }
5453
// Check whether 'perms' may apply the setattr described by (stx, mask) to
// 'in', mirroring POSIX chown/chmod/utimes rules (root bypasses most).
// Returns 0 if permitted, otherwise a negative errno (-EPERM/-EACCES/...).
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncation requires plain write permission
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here every failed check falls out with -EPERM
  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; owner may "change" it to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may switch group only to one of its own groups (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	       	     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's group lose the setgid bit
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // setting explicit (non-"now") times needs ownership; "now" only
      // needs write permission, like touch(1) on another user's file
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5510
// Check whether 'perms' may open 'in' with open(2) flags 'flags'.
// Derives MAY_READ/MAY_WRITE from O_ACCMODE (O_TRUNC implies write),
// rejects symlinks (-ELOOP) and writable opens of directories (-EISDIR),
// then defers to inode_permission(). Returns 0 or a negative errno.
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  // map the open accmode onto MAY_* permission bits
  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      // symlinks are never opened directly
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
	r = -EISDIR;
	goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5547
5548 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5549 {
5550 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5551 int r = _getattr_for_perm(dir, perms);
5552 if (r < 0)
5553 goto out;
5554
5555 r = inode_permission(dir, perms, MAY_EXEC);
5556 out:
5557 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5558 return r;
5559 }
5560
5561 int Client::may_create(Inode *dir, const UserPerm& perms)
5562 {
5563 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5564 int r = _getattr_for_perm(dir, perms);
5565 if (r < 0)
5566 goto out;
5567
5568 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5569 out:
5570 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5571 return r;
5572 }
5573
5574 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5575 {
5576 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5577 int r = _getattr_for_perm(dir, perms);
5578 if (r < 0)
5579 goto out;
5580
5581 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5582 if (r < 0)
5583 goto out;
5584
5585 /* 'name == NULL' means rmsnap */
5586 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5587 InodeRef otherin;
5588 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5589 if (r < 0)
5590 goto out;
5591 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5592 r = -EPERM;
5593 }
5594 out:
5595 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5596 return r;
5597 }
5598
// Check whether 'perms' may create a hardlink to 'in'. Non-owners are
// restricted: the target must be a regular file, must not be setuid, and
// must not be setgid+group-executable; otherwise read+write permission is
// required. These checks appear to mirror Linux's fs.protected_hardlinks
// behavior -- TODO confirm against the kernel semantics.
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // root and the owner may always link
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5626
5627 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5628 {
5629 int mask = CEPH_STAT_CAP_MODE;
5630 bool force = false;
5631 if (acl_type != NO_ACL) {
5632 mask |= CEPH_STAT_CAP_XATTR;
5633 force = in->xattr_version == 0;
5634 }
5635 return _getattr(in, mask, perms, force);
5636 }
5637
5638 vinodeno_t Client::_get_vino(Inode *in)
5639 {
5640 /* The caller must hold the client lock */
5641 return vinodeno_t(in->ino, in->snapid);
5642 }
5643
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) try to parse as a role (rank or filesystem:rank)
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
	fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  // 2) try to parse as a raw numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5715
5716
/**
 * Authenticate with mon and establish global ID
 *
 * Caller must hold client_lock; it is dropped while the (blocking)
 * monclient authentication runs and reacquired afterwards.
 * Returns 0 on success or a negative errno from monclient.
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  // authenticate() blocks on network I/O -- release our lock meanwhile
  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  // the mon-assigned global id becomes our client entity name
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5740
// Fetch a current FSMap (full map, or the trimmed "fsmap.user" variant when
// 'user' is true) from the monitors: learn the latest version, then
// subscribe and wait until our cached copy is at least that new. Caller
// holds client_lock; it is dropped while waiting on the monitor.
// Returns 0 on success or a negative errno.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.unlock();
    r = cond.wait();
    client_lock.lock();
  } while (r == -EAGAIN);  // mon may ask us to retry

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // fsmap.user: reduced map for non-admin clients (fills fsmap_user)
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5784
/**
 * Send an admin command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Resolves the spec via the FSMap, skips laggy daemons, and fans the
 * command out to all remaining targets; 'onfinish' completes once every
 * per-daemon reply has arrived (see handle_command_reply). Returns 0 on
 * dispatch, or a negative errno if resolution/authentication fails.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need an up-to-date FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5871
5872 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
5873 {
5874 ceph_tid_t const tid = m->get_tid();
5875
5876 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5877
5878 if (!command_table.exists(tid)) {
5879 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5880 return;
5881 }
5882
5883 auto &op = command_table.get_command(tid);
5884 if (op.outbl) {
5885 *op.outbl = m->get_data();
5886 }
5887 if (op.outs) {
5888 *op.outs = m->rs;
5889 }
5890
5891 if (op.on_finish) {
5892 op.on_finish->complete(m->r);
5893 }
5894
5895 command_table.erase(tid);
5896 }
5897
5898 // -------------------
5899 // MOUNT
5900
// Authenticate and subscribe to the MDSMap for the filesystem named by
// 'fs_name' (falling back to the "client_fs" / legacy "client_mds_namespace"
// config options, or the cluster default when all are empty). Sets 'fscid'
// when a specific filesystem is chosen. Returns 0 or a negative errno.
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
    if (resolved_fs_name.empty())
      // Try the backwards compatibility fs name option
      resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // named fs: translate the name to a cluster id via the user fsmap,
    // then subscribe to that filesystem's map ("mdsmap.<fscid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5940
// Mount the filesystem: subscribe to the MDSMap (optionally waiting for an
// available MDS cluster when 'require_mds'), then walk from the mount root
// up to the filesystem root issuing GETATTRs so quota/ancestor state is
// cached. Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each of its ancestors up to the root,
  // so quota information along the path is known to the client
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);  // pin the root inode for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
   */
  return 0;
}
6034
6035 // UNMOUNT
6036
// Close every MDS session: drop already-rejected sessions, then send
// CLOSE to the rest and wait (bounded by client_shutdown_timeout) for the
// MDS acks; sessions that do not answer in time are force-closed with
// -ETIMEDOUT. Caller holds client_lock.
void Client::_close_sessions()
{
  // rejected sessions will never ack a close; discard them up front
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second.state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // adopt the already-held client_lock so mount_cond can release it while
    // waiting; l.release() below hands ownership back without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      mount_cond.wait(l);  // timeout of 0 means wait forever
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(&session, -ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6075
6076 void Client::flush_mdlog_sync()
6077 {
6078 if (mds_requests.empty())
6079 return;
6080 for (auto &p : mds_sessions) {
6081 flush_mdlog(&p.second);
6082 }
6083 }
6084
6085 void Client::flush_mdlog(MetaSession *session)
6086 {
6087 // Only send this to Luminous or newer MDS daemons, older daemons
6088 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6089 const uint64_t features = session->con->get_features();
6090 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6091 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6092 session->con->send_message2(std::move(m));
6093 }
6094 }
6095
6096
// Abort all in-flight MDS requests with 'err' and force-close every MDS
// session. Used on blacklisting / abortive unmount. Caller holds
// client_lock.
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;  // advance first; the request may be erased below
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      // wake the thread blocked in make_request so it observes the abort
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session, err);
  }
}
6124
// Tear down the mount. When 'abort' (or blacklisted) drop everything on the
// floor: abort requests, purge caches, discard dirty caps. Otherwise flush
// dirty data and caps to the cluster first. Waits for outstanding requests,
// closes all open files/dirs, drains the cache, then closes MDS sessions.
// Caller holds client_lock (adopted here and handed back via release()).
void Client::_unmount(bool abort)
{
  std::unique_lock lock{client_lock, std::adopt_lock};
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for all outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and files opened via the low-level (ll_) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  // wait for unsafe (unacked) sync writes to settle
  mount_cond.wait(lock, [this] {
    if (unsafe_sync_write > 0) {
      ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting"
		    << dendl;
    }
    return unsafe_sync_write <= 0;
  });

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // cluster is unreachable: dirty caps can never be flushed, drop them
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for all remaining inodes to be released (caps returned), dumping
  // the cache every 5s while stuck to aid debugging
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
            << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
            << dendl;
    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  // hand lock ownership back to the caller without unlocking
  lock.release();
  ldout(cct, 2) << "unmounted." << dendl;
}
6262
6263 void Client::unmount()
6264 {
6265 std::lock_guard lock(client_lock);
6266 _unmount(false);
6267 }
6268
6269 void Client::abort_conn()
6270 {
6271 std::lock_guard lock(client_lock);
6272 _unmount(true);
6273 }
6274
6275 void Client::flush_cap_releases()
6276 {
6277 // send any cap releases
6278 for (auto &p : mds_sessions) {
6279 auto &session = p.second;
6280 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6281 p.first)) {
6282 if (cct->_conf->client_inject_release_failure) {
6283 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6284 } else {
6285 session.con->send_message2(std::move(session.release));
6286 }
6287 session.release.reset();
6288 }
6289 }
6290 }
6291
// Periodic housekeeping, re-armed via the client timer. Runs with
// client_lock held (the Timer callback asserts this below). Handles mount
// timeouts, cap renewal/release, delayed cap checks, cache trimming, and
// automatic reconnect after blacklisting.
6292 void Client::tick()
6293 {
// Debug hook: optionally stall one tick, then clear the knob so it only
// fires once.
6294 if (cct->_conf->client_debug_inject_tick_delay > 0) {
6295 sleep(cct->_conf->client_debug_inject_tick_delay);
6296 ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
6297 cct->_conf.apply_changes(nullptr);
6298 }
6299
6300 ldout(cct, 21) << "tick" << dendl;
// Re-arm the next tick before doing any work.
6301 tick_event = timer.add_event_after(
6302 cct->_conf->client_tick_interval,
6303 new LambdaContext([this](int) {
6304 // Called back via Timer, which takes client_lock for us
6305 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6306 tick();
6307 }));
6308 utime_t now = ceph_clock_now();
6309
// If we are still mounting and the oldest request has exceeded the mount
// timeout, abort it and wake everyone waiting on the mdsmap/session opens.
6310 if (!mounted && !mds_requests.empty()) {
6311 MetaRequest *req = mds_requests.begin()->second;
6312 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6313 req->abort(-ETIMEDOUT);
6314 if (req->caller_cond) {
6315 req->kick = true;
6316 req->caller_cond->notify_all();
6317 }
6318 signal_cond_list(waiting_for_mdsmap);
6319 for (auto &p : mds_sessions) {
6320 signal_context_list(p.second.waiting_for_open);
6321 }
6322 }
6323 }
6324
6325 if (mdsmap->get_epoch()) {
6326 // renew caps?
6327 utime_t el = now - last_cap_renew;
6328 if (el > mdsmap->get_session_timeout() / 3.0)
6329 renew_caps();
6330
6331 flush_cap_releases();
6332 }
6333
6334 // delayed caps
// delayed_list is ordered by hold_caps_until, so stop at the first inode
// whose hold time hasn't expired yet.
6335 xlist<Inode*>::iterator p = delayed_list.begin();
6336 while (!p.end()) {
6337 Inode *in = *p;
6338 ++p;
6339 if (in->hold_caps_until > now)
6340 break;
6341 delayed_list.pop_front();
6342 check_caps(in, CHECK_CAPS_NODELAY);
6343 }
6344
6345 trim_cache(true);
6346
// Auto-reconnect after blacklisting: at most once per 30 minutes, and only
// if the admin enabled client_reconnect_stale.
6347 if (blacklisted && mounted &&
6348 last_auto_reconnect + 30 * 60 < now &&
6349 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6350 messenger->client_reset();
6351 fd_gen++; // invalidate open files
6352 blacklisted = false;
6353 _kick_stale_sessions();
6354 last_auto_reconnect = now;
6355 }
6356 }
6357
6358 void Client::renew_caps()
6359 {
6360 ldout(cct, 10) << "renew_caps()" << dendl;
6361 last_cap_renew = ceph_clock_now();
6362
6363 for (auto &p : mds_sessions) {
6364 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6365 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6366 renew_caps(&p.second);
6367 }
6368 }
6369
6370 void Client::renew_caps(MetaSession *session)
6371 {
6372 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6373 session->last_cap_renew_request = ceph_clock_now();
6374 uint64_t seq = ++session->cap_renew_seq;
6375 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6376 }
6377
6378
6379 // ===============================================================
6380 // high level (POSIXy) interface
6381
6382 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6383 InodeRef *target, const UserPerm& perms)
6384 {
6385 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6386 MetaRequest *req = new MetaRequest(op);
6387 filepath path;
6388 dir->make_nosnap_relative_path(path);
6389 path.push_dentry(name);
6390 req->set_filepath(path);
6391 req->set_inode(dir);
6392 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6393 mask |= DEBUG_GETATTR_CAPS;
6394 req->head.args.getattr.mask = mask;
6395
6396 ldout(cct, 10) << __func__ << " on " << path << dendl;
6397
6398 int r = make_request(req, perms, target);
6399 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6400 return r;
6401 }
6402
// Look up `dname` in `dir`, preferring the local dentry cache when a valid
// dentry lease or directory FILE_SHARED cap proves it is still fresh;
// otherwise fall through to an MDS lookup via _do_lookup(). `mask` is the
// set of caps the caller wants issued on the target. Returns 0 with
// *target set, or a negative errno (-ENOENT may be concluded locally when
// the directory is known complete).
6403 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6404 const UserPerm& perms)
6405 {
6406 int r = 0;
6407 Dentry *dn = NULL;
6408 // can only request shared caps
6409 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6410
// ".." needs the parent: use the cached parent dentry if we have one,
// otherwise ask a random MDS with LOOKUPPARENT.
6411 if (dname == "..") {
6412 if (dir->dentries.empty()) {
6413 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6414 filepath path(dir->ino);
6415 req->set_filepath(path);
6416
6417 InodeRef tmptarget;
6418 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6419
6420 if (r == 0) {
6421 *target = std::move(tmptarget);
6422 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6423 } else {
// fall back to the directory itself (e.g. ".." at the root)
6424 *target = dir;
6425 }
6426 }
6427 else
6428 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6429 goto done;
6430 }
6431
6432 if (dname == ".") {
6433 *target = dir;
6434 goto done;
6435 }
6436
6437 if (!dir->is_dir()) {
6438 r = -ENOTDIR;
6439 goto done;
6440 }
6441
6442 if (dname.length() > NAME_MAX) {
6443 r = -ENAMETOOLONG;
6444 goto done;
6445 }
6446
// The virtual snapshot directory (default ".snap") only exists on live
// (non-snapped) inodes.
6447 if (dname == cct->_conf->client_snapdir &&
6448 dir->snapid == CEPH_NOSNAP) {
6449 *target = open_snapdir(dir);
6450 goto done;
6451 }
6452
6453 if (dir->dir &&
6454 dir->dir->dentries.count(dname)) {
6455 dn = dir->dir->dentries[dname];
6456
6457 ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6458 << " seq " << dn->lease_seq
6459 << dendl;
6460
// The cached dentry is only usable if its inode carries the caps the
// caller asked for (or it is a null dentry).
6461 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6462 // is dn lease valid?
6463 utime_t now = ceph_clock_now();
6464 if (dn->lease_mds >= 0 &&
6465 dn->lease_ttl > now &&
6466 mds_sessions.count(dn->lease_mds)) {
6467 MetaSession &s = mds_sessions.at(dn->lease_mds);
// lease is only valid while the issuing session's caps are live and the
// session generation hasn't changed since the lease was granted
6468 if (s.cap_ttl > now &&
6469 s.cap_gen == dn->lease_gen) {
6470 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6471 // make trim_caps() behave.
6472 dir->try_touch_cap(dn->lease_mds);
6473 goto hit_dn;
6474 }
6475 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6476 << " vs lease_gen " << dn->lease_gen << dendl;
6477 }
6478 // dir shared caps?
6479 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6480 if (dn->cap_shared_gen == dir->shared_gen &&
6481 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6482 goto hit_dn;
// a null dentry in a complete directory proves the name doesn't exist
6483 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6484 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6485 << *dir << " dn '" << dname << "'" << dendl;
6486 return -ENOENT;
6487 }
6488 }
6489 } else {
6490 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6491 }
6492 } else {
6493 // can we conclude ENOENT locally?
6494 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6495 (dir->flags & I_COMPLETE)) {
6496 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6497 return -ENOENT;
6498 }
6499 }
6500
// cache miss (or stale) — go to the MDS
6501 r = _do_lookup(dir, dname, mask, target, perms);
6502 goto done;
6503
6504 hit_dn:
6505 if (dn->inode) {
6506 *target = dn->inode;
6507 } else {
// valid null dentry: the name is known not to exist
6508 r = -ENOENT;
6509 }
6510 touch_dn(dn);
6511
6512 done:
6513 if (r < 0)
6514 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6515 else
6516 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6517 return r;
6518 }
6519
6520 int Client::get_or_create(Inode *dir, const char* name,
6521 Dentry **pdn, bool expect_null)
6522 {
6523 // lookup
6524 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6525 dir->open_dir();
6526 if (dir->dir->dentries.count(name)) {
6527 Dentry *dn = dir->dir->dentries[name];
6528
6529 // is dn lease valid?
6530 utime_t now = ceph_clock_now();
6531 if (dn->inode &&
6532 dn->lease_mds >= 0 &&
6533 dn->lease_ttl > now &&
6534 mds_sessions.count(dn->lease_mds)) {
6535 MetaSession &s = mds_sessions.at(dn->lease_mds);
6536 if (s.cap_ttl > now &&
6537 s.cap_gen == dn->lease_gen) {
6538 if (expect_null)
6539 return -EEXIST;
6540 }
6541 }
6542 *pdn = dn;
6543 } else {
6544 // otherwise link up a new one
6545 *pdn = link(dir->dir, name, NULL, NULL);
6546 }
6547
6548 // success
6549 return 0;
6550 }
6551
// Resolve `origpath` component by component, starting from root (absolute)
// or cwd (relative), and return the final inode in *end. Directory
// symlinks are always followed; a trailing symlink is only followed when
// `followsym` is set. `mask` is extra caps to request on the final
// component. Returns 0, or a negative errno (-ELOOP after MAXSYMLINKS).
6552 int Client::path_walk(const filepath& origpath, InodeRef *end,
6553 const UserPerm& perms, bool followsym, int mask)
6554 {
6555 filepath path = origpath;
6556 InodeRef cur;
6557 if (origpath.absolute())
6558 cur = root;
6559 else
6560 cur = cwd;
6561 ceph_assert(cur);
6562
6563 ldout(cct, 10) << __func__ << " " << path << dendl;
6564
6565 int symlinks = 0;
6566
6567 unsigned i=0;
6568 while (i < path.depth() && cur) {
6569 int caps = 0;
6570 const string &dname = path[i];
6571 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6572 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6573 InodeRef next;
6574 if (cct->_conf->client_permissions) {
6575 int r = may_lookup(cur.get(), perms);
6576 if (r < 0)
6577 return r;
6578 caps = CEPH_CAP_AUTH_SHARED;
6579 }
6580
6581 /* Get extra requested caps on the last component */
6582 if (i == (path.depth() - 1))
6583 caps |= mask;
6584 int r = _lookup(cur.get(), dname, caps, &next, perms);
6585 if (r < 0)
6586 return r;
6587 // only follow trailing symlink if followsym. always follow
6588 // 'directory' symlinks.
6589 if (next && next->is_symlink()) {
6590 symlinks++;
6591 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6592 if (symlinks > MAXSYMLINKS) {
6593 return -ELOOP;
6594 }
6595
6596 if (i < path.depth() - 1) {
6597 // dir symlink
6598 // replace consumed components of path with symlink dir target
// splice the link target in front of the remaining components and
// restart the walk from index 0 (from root if the target is absolute,
// otherwise from the current position)
6599 filepath resolved(next->symlink.c_str());
6600 resolved.append(path.postfixpath(i + 1));
6601 path = resolved;
6602 i = 0;
6603 if (next->symlink[0] == '/') {
6604 cur = root;
6605 }
6606 continue;
6607 } else if (followsym) {
6608 if (next->symlink[0] == '/') {
6609 path = next->symlink.c_str();
6610 i = 0;
6611 // reset position
6612 cur = root;
6613 } else {
6614 filepath more(next->symlink.c_str());
6615 // we need to remove the symlink component from off of the path
6616 // before adding the target that the symlink points to. remain
6617 // at the same position in the path.
6618 path.pop_dentry();
6619 path.append(more);
6620 }
6621 continue;
6622 }
6623 }
6624 cur.swap(next);
6625 i++;
6626 }
// cur can only be null if a lookup produced no inode
6627 if (!cur)
6628 return -ENOENT;
6629 if (end)
6630 end->swap(cur);
6631 return 0;
6632 }
6633
6634
6635 // namespace ops
6636
6637 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6638 {
6639 std::lock_guard lock(client_lock);
6640 tout(cct) << "link" << std::endl;
6641 tout(cct) << relexisting << std::endl;
6642 tout(cct) << relpath << std::endl;
6643
6644 if (unmounting)
6645 return -ENOTCONN;
6646
6647 filepath existing(relexisting);
6648
6649 InodeRef in, dir;
6650 int r = path_walk(existing, &in, perm, true);
6651 if (r < 0)
6652 return r;
6653 if (std::string(relpath) == "/") {
6654 r = -EEXIST;
6655 return r;
6656 }
6657 filepath path(relpath);
6658 string name = path.last_dentry();
6659 path.pop_dentry();
6660
6661 r = path_walk(path, &dir, perm, true);
6662 if (r < 0)
6663 return r;
6664 if (cct->_conf->client_permissions) {
6665 if (S_ISDIR(in->mode)) {
6666 r = -EPERM;
6667 return r;
6668 }
6669 r = may_hardlink(in.get(), perm);
6670 if (r < 0)
6671 return r;
6672 r = may_create(dir.get(), perm);
6673 if (r < 0)
6674 return r;
6675 }
6676 r = _link(in.get(), dir.get(), name.c_str(), perm);
6677 return r;
6678 }
6679
6680 int Client::unlink(const char *relpath, const UserPerm& perm)
6681 {
6682 std::lock_guard lock(client_lock);
6683 tout(cct) << __func__ << std::endl;
6684 tout(cct) << relpath << std::endl;
6685
6686 if (unmounting)
6687 return -ENOTCONN;
6688
6689 if (std::string(relpath) == "/")
6690 return -EISDIR;
6691
6692 filepath path(relpath);
6693 string name = path.last_dentry();
6694 path.pop_dentry();
6695 InodeRef dir;
6696 int r = path_walk(path, &dir, perm);
6697 if (r < 0)
6698 return r;
6699 if (cct->_conf->client_permissions) {
6700 r = may_delete(dir.get(), name.c_str(), perm);
6701 if (r < 0)
6702 return r;
6703 }
6704 return _unlink(dir.get(), name.c_str(), perm);
6705 }
6706
6707 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6708 {
6709 std::lock_guard lock(client_lock);
6710 tout(cct) << __func__ << std::endl;
6711 tout(cct) << relfrom << std::endl;
6712 tout(cct) << relto << std::endl;
6713
6714 if (unmounting)
6715 return -ENOTCONN;
6716
6717 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6718 return -EBUSY;
6719
6720 filepath from(relfrom);
6721 filepath to(relto);
6722 string fromname = from.last_dentry();
6723 from.pop_dentry();
6724 string toname = to.last_dentry();
6725 to.pop_dentry();
6726
6727 InodeRef fromdir, todir;
6728 int r = path_walk(from, &fromdir, perm);
6729 if (r < 0)
6730 goto out;
6731 r = path_walk(to, &todir, perm);
6732 if (r < 0)
6733 goto out;
6734
6735 if (cct->_conf->client_permissions) {
6736 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6737 if (r < 0)
6738 return r;
6739 r = may_delete(todir.get(), toname.c_str(), perm);
6740 if (r < 0 && r != -ENOENT)
6741 return r;
6742 }
6743 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6744 out:
6745 return r;
6746 }
6747
6748 // dirs
6749
6750 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6751 {
6752 std::lock_guard lock(client_lock);
6753 tout(cct) << __func__ << std::endl;
6754 tout(cct) << relpath << std::endl;
6755 tout(cct) << mode << std::endl;
6756 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
6757
6758 if (unmounting)
6759 return -ENOTCONN;
6760
6761 if (std::string(relpath) == "/")
6762 return -EEXIST;
6763
6764 filepath path(relpath);
6765 string name = path.last_dentry();
6766 path.pop_dentry();
6767 InodeRef dir;
6768 int r = path_walk(path, &dir, perm);
6769 if (r < 0)
6770 return r;
6771 if (cct->_conf->client_permissions) {
6772 r = may_create(dir.get(), perm);
6773 if (r < 0)
6774 return r;
6775 }
6776 return _mkdir(dir.get(), name.c_str(), mode, perm);
6777 }
6778
// mkdir -p: walk as many existing components of relpath as possible, then
// create the remaining directories one level at a time. Returns 0 on
// success, or a negative errno. Note: the walk starts from cwd, so absolute
// paths rely on path[] components being relative to the root cwd.
6779 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6780 {
6781 std::lock_guard lock(client_lock);
6782 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6783 tout(cct) << __func__ << std::endl;
6784 tout(cct) << relpath << std::endl;
6785 tout(cct) << mode << std::endl;
6786
6787 if (unmounting)
6788 return -ENOTCONN;
6789
6790 //get through existing parts of path
6791 filepath path(relpath);
6792 unsigned int i;
6793 int r = 0, caps = 0;
6794 InodeRef cur, next;
6795 cur = cwd;
6796 for (i=0; i<path.depth(); ++i) {
6797 if (cct->_conf->client_permissions) {
6798 r = may_lookup(cur.get(), perms);
6799 if (r < 0)
6800 break;
6801 caps = CEPH_CAP_AUTH_SHARED;
6802 }
6803 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6804 if (r < 0)
6805 break;
6806 cur.swap(next);
6807 }
// Only -ENOENT means "start creating here"; any other error (including
// r == 0, i.e. the full path already exists) is returned as-is.
6808 if (r!=-ENOENT) return r;
6809 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
6810 //make new directory at each level
6811 for (; i<path.depth(); ++i) {
6812 if (cct->_conf->client_permissions) {
6813 r = may_create(cur.get(), perms);
6814 if (r < 0)
6815 return r;
6816 }
6817 //make new dir
6818 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6819
6820 //check proper creation/existence
// A racing creator may have made an intermediate dir; treat EEXIST on a
// non-final component as success and look it up instead.
6821 if(-EEXIST == r && i < path.depth() - 1) {
6822 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6823 }
6824 if (r < 0)
6825 return r;
6826 //move to new dir and continue
6827 cur.swap(next);
6828 ldout(cct, 20) << __func__ << ": successfully created directory "
6829 << filepath(cur->ino).get_path() << dendl;
6830 }
6831 return 0;
6832 }
6833
6834 int Client::rmdir(const char *relpath, const UserPerm& perms)
6835 {
6836 std::lock_guard lock(client_lock);
6837 tout(cct) << __func__ << std::endl;
6838 tout(cct) << relpath << std::endl;
6839
6840 if (unmounting)
6841 return -ENOTCONN;
6842
6843 if (std::string(relpath) == "/")
6844 return -EBUSY;
6845
6846 filepath path(relpath);
6847 string name = path.last_dentry();
6848 path.pop_dentry();
6849 InodeRef dir;
6850 int r = path_walk(path, &dir, perms);
6851 if (r < 0)
6852 return r;
6853 if (cct->_conf->client_permissions) {
6854 int r = may_delete(dir.get(), name.c_str(), perms);
6855 if (r < 0)
6856 return r;
6857 }
6858 return _rmdir(dir.get(), name.c_str(), perms);
6859 }
6860
6861 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6862 {
6863 std::lock_guard lock(client_lock);
6864 tout(cct) << __func__ << std::endl;
6865 tout(cct) << relpath << std::endl;
6866 tout(cct) << mode << std::endl;
6867 tout(cct) << rdev << std::endl;
6868
6869 if (unmounting)
6870 return -ENOTCONN;
6871
6872 if (std::string(relpath) == "/")
6873 return -EEXIST;
6874
6875 filepath path(relpath);
6876 string name = path.last_dentry();
6877 path.pop_dentry();
6878 InodeRef dir;
6879 int r = path_walk(path, &dir, perms);
6880 if (r < 0)
6881 return r;
6882 if (cct->_conf->client_permissions) {
6883 int r = may_create(dir.get(), perms);
6884 if (r < 0)
6885 return r;
6886 }
6887 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6888 }
6889
6890 // symlinks
6891
6892 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6893 {
6894 std::lock_guard lock(client_lock);
6895 tout(cct) << __func__ << std::endl;
6896 tout(cct) << target << std::endl;
6897 tout(cct) << relpath << std::endl;
6898
6899 if (unmounting)
6900 return -ENOTCONN;
6901
6902 if (std::string(relpath) == "/")
6903 return -EEXIST;
6904
6905 filepath path(relpath);
6906 string name = path.last_dentry();
6907 path.pop_dentry();
6908 InodeRef dir;
6909 int r = path_walk(path, &dir, perms);
6910 if (r < 0)
6911 return r;
6912 if (cct->_conf->client_permissions) {
6913 int r = may_create(dir.get(), perms);
6914 if (r < 0)
6915 return r;
6916 }
6917 return _symlink(dir.get(), name.c_str(), target, perms);
6918 }
6919
6920 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6921 {
6922 std::lock_guard lock(client_lock);
6923 tout(cct) << __func__ << std::endl;
6924 tout(cct) << relpath << std::endl;
6925
6926 if (unmounting)
6927 return -ENOTCONN;
6928
6929 filepath path(relpath);
6930 InodeRef in;
6931 int r = path_walk(path, &in, perms, false);
6932 if (r < 0)
6933 return r;
6934
6935 return _readlink(in.get(), buf, size);
6936 }
6937
6938 int Client::_readlink(Inode *in, char *buf, size_t size)
6939 {
6940 if (!in->is_symlink())
6941 return -EINVAL;
6942
6943 // copy into buf (at most size bytes)
6944 int r = in->symlink.length();
6945 if (r > (int)size)
6946 r = size;
6947 memcpy(buf, in->symlink.c_str(), r);
6948 return r;
6949 }
6950
6951
6952 // inode stuff
6953
6954 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6955 {
6956 bool yes = in->caps_issued_mask(mask, true);
6957
6958 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
6959 if (yes && !force)
6960 return 0;
6961
6962 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6963 filepath path;
6964 in->make_nosnap_relative_path(path);
6965 req->set_filepath(path);
6966 req->set_inode(in);
6967 req->head.args.getattr.mask = mask;
6968
6969 int res = make_request(req, perms);
6970 ldout(cct, 10) << __func__ << " result=" << res << dendl;
6971 return res;
6972 }
6973
// Apply the setattr described by (stx, mask) to `in`. Fields we hold
// exclusive caps for are changed locally and marked dirty (written back
// later); whatever remains in `mask` afterwards is sent to the MDS as a
// SETATTR request. Returns 0 or a negative errno (-EROFS on snapshots,
// -EDQUOT on quota, -EFBIG on oversize truncate).
6974 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6975 const UserPerm& perms, InodeRef *inp)
6976 {
6977 int issued = in->caps_issued();
6978
6979 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
6980 ccap_string(issued) << dendl;
6981
// snapshots are immutable
6982 if (in->snapid != CEPH_NOSNAP) {
6983 return -EROFS;
6984 }
// growing the file must respect byte quotas
6985 if ((mask & CEPH_SETATTR_SIZE) &&
6986 (unsigned long)stx->stx_size > in->size &&
6987 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6988 perms)) {
6989 return -EDQUOT;
6990 }
6991
6992 // make the change locally?
// If a different user already dirtied this inode's caps, don't mix
// credentials in the local dirty state — force a synchronous request.
6993 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6994 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6995 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6996 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6997 << in->cap_dirtier_gid << ", forcing sync setattr"
6998 << dendl;
6999 /*
7000 * This works because we implicitly flush the caps as part of the
7001 * request, so the cap update check will happen with the writeback
7002 * cap context, and then the setattr check will happen with the
7003 * caller's context.
7004 *
7005 * In reality this pattern is likely pretty rare (different users
7006 * setattr'ing the same file). If that turns out not to be the
7007 * case later, we can build a more complex pipelined cap writeback
7008 * infrastructure...
7009 */
7010 if (!mask)
7011 mask |= CEPH_SETATTR_CTIME;
7012 goto force_request;
7013 }
7014
7015 if (!mask) {
7016 // caller just needs us to bump the ctime
7017 in->ctime = ceph_clock_now();
7018 in->cap_dirtier_uid = perms.uid();
7019 in->cap_dirtier_gid = perms.gid();
// mark whichever exclusive cap we hold dirty so the new ctime is flushed;
// if we hold none, fall through to an MDS request below
7020 if (issued & CEPH_CAP_AUTH_EXCL)
7021 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7022 else if (issued & CEPH_CAP_FILE_EXCL)
7023 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7024 else if (issued & CEPH_CAP_XATTR_EXCL)
7025 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7026 else
7027 mask |= CEPH_SETATTR_CTIME;
7028 }
7029
// With AUTH_EXCL we may change ownership/mode/btime locally; each handled
// field is cleared from `mask`.
7030 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7031 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7032
7033 mask &= ~CEPH_SETATTR_KILL_SGUID;
7034
7035 if (mask & CEPH_SETATTR_UID) {
7036 in->ctime = ceph_clock_now();
7037 in->cap_dirtier_uid = perms.uid();
7038 in->cap_dirtier_gid = perms.gid();
7039 in->uid = stx->stx_uid;
7040 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7041 mask &= ~CEPH_SETATTR_UID;
7042 kill_sguid = true;
7043 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7044 }
7045 if (mask & CEPH_SETATTR_GID) {
7046 in->ctime = ceph_clock_now();
7047 in->cap_dirtier_uid = perms.uid();
7048 in->cap_dirtier_gid = perms.gid();
7049 in->gid = stx->stx_gid;
7050 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7051 mask &= ~CEPH_SETATTR_GID;
7052 kill_sguid = true;
7053 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7054 }
7055
7056 if (mask & CEPH_SETATTR_MODE) {
7057 in->ctime = ceph_clock_now();
7058 in->cap_dirtier_uid = perms.uid();
7059 in->cap_dirtier_gid = perms.gid();
7060 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7061 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7062 mask &= ~CEPH_SETATTR_MODE;
7063 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7064 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7065 /* Must squash the any setuid/setgid bits with an ownership change */
7066 in->mode &= ~(S_ISUID|S_ISGID);
7067 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7068 }
7069
7070 if (mask & CEPH_SETATTR_BTIME) {
7071 in->ctime = ceph_clock_now();
7072 in->cap_dirtier_uid = perms.uid();
7073 in->cap_dirtier_gid = perms.gid();
7074 in->btime = utime_t(stx->stx_btime);
7075 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7076 mask &= ~CEPH_SETATTR_BTIME;
7077 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7078 }
7079 } else if (mask & CEPH_SETATTR_SIZE) {
7080 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7081 mask |= CEPH_SETATTR_KILL_SGUID;
7082 }
7083
// With FILE_EXCL we may update mtime/atime locally; time_warp_seq tells
// the MDS the local timestamps take precedence.
7084 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7085 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7086 if (mask & CEPH_SETATTR_MTIME)
7087 in->mtime = utime_t(stx->stx_mtime);
7088 if (mask & CEPH_SETATTR_ATIME)
7089 in->atime = utime_t(stx->stx_atime);
7090 in->ctime = ceph_clock_now();
7091 in->cap_dirtier_uid = perms.uid();
7092 in->cap_dirtier_gid = perms.gid();
7093 in->time_warp_seq++;
7094 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7095 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7096 }
7097 }
// everything was satisfied locally — no MDS request needed
7098 if (!mask) {
7099 in->change_attr++;
7100 return 0;
7101 }
7102
7103 force_request:
// Build the SETATTR request for the fields we couldn't handle locally.
// inode_drop lists caps the MDS should revoke from us as part of the op.
7104 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7105
7106 filepath path;
7107
7108 in->make_nosnap_relative_path(path);
7109 req->set_filepath(path);
7110 req->set_inode(in);
7111
7112 if (mask & CEPH_SETATTR_KILL_SGUID) {
7113 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7114 }
7115 if (mask & CEPH_SETATTR_MODE) {
7116 req->head.args.setattr.mode = stx->stx_mode;
7117 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7118 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7119 }
7120 if (mask & CEPH_SETATTR_UID) {
7121 req->head.args.setattr.uid = stx->stx_uid;
7122 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7123 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7124 }
7125 if (mask & CEPH_SETATTR_GID) {
7126 req->head.args.setattr.gid = stx->stx_gid;
7127 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7128 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7129 }
7130 if (mask & CEPH_SETATTR_BTIME) {
7131 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7132 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7133 }
7134 if (mask & CEPH_SETATTR_MTIME) {
7135 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7136 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7137 CEPH_CAP_FILE_WR;
7138 }
7139 if (mask & CEPH_SETATTR_ATIME) {
7140 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7141 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7142 CEPH_CAP_FILE_WR;
7143 }
7144 if (mask & CEPH_SETATTR_SIZE) {
7145 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
7146 req->head.args.setattr.size = stx->stx_size;
7147 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7148 } else { //too big!
7149 put_request(req);
7150 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7151 return -EFBIG;
7152 }
7153 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7154 CEPH_CAP_FILE_WR;
7155 }
7156 req->head.args.setattr.mask = mask;
7157
7158 req->regetattr_mask = mask;
7159
7160 int res = make_request(req, perms, inp);
7161 ldout(cct, 10) << "_setattr result=" << res << dendl;
7162 return res;
7163 }
7164
7165 /* Note that we only care about attrs that setattr cares about */
7166 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7167 {
7168 stx->stx_size = st->st_size;
7169 stx->stx_mode = st->st_mode;
7170 stx->stx_uid = st->st_uid;
7171 stx->stx_gid = st->st_gid;
7172 #ifdef __APPLE__
7173 stx->stx_mtime = st->st_mtimespec;
7174 stx->stx_atime = st->st_atimespec;
7175 #else
7176 stx->stx_mtime = st->st_mtim;
7177 stx->stx_atime = st->st_atim;
7178 #endif
7179 }
7180
7181 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7182 const UserPerm& perms, InodeRef *inp)
7183 {
7184 int ret = _do_setattr(in, stx, mask, perms, inp);
7185 if (ret < 0)
7186 return ret;
7187 if (mask & CEPH_SETATTR_MODE)
7188 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7189 return ret;
7190 }
7191
7192 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7193 const UserPerm& perms)
7194 {
7195 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7196 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7197 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7198 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7199 if (cct->_conf->client_permissions) {
7200 int r = may_setattr(in.get(), stx, mask, perms);
7201 if (r < 0)
7202 return r;
7203 }
7204 return __setattrx(in.get(), stx, mask, perms);
7205 }
7206
7207 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7208 const UserPerm& perms)
7209 {
7210 struct ceph_statx stx;
7211
7212 stat_to_statx(attr, &stx);
7213 mask &= ~CEPH_SETATTR_BTIME;
7214
7215 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7216 mask &= ~CEPH_SETATTR_UID;
7217 }
7218 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7219 mask &= ~CEPH_SETATTR_GID;
7220 }
7221
7222 return _setattrx(in, &stx, mask, perms);
7223 }
7224
7225 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7226 const UserPerm& perms)
7227 {
7228 std::lock_guard lock(client_lock);
7229 tout(cct) << __func__ << std::endl;
7230 tout(cct) << relpath << std::endl;
7231 tout(cct) << mask << std::endl;
7232
7233 if (unmounting)
7234 return -ENOTCONN;
7235
7236 filepath path(relpath);
7237 InodeRef in;
7238 int r = path_walk(path, &in, perms);
7239 if (r < 0)
7240 return r;
7241 return _setattr(in, attr, mask, perms);
7242 }
7243
7244 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7245 const UserPerm& perms, int flags)
7246 {
7247 std::lock_guard lock(client_lock);
7248 tout(cct) << __func__ << std::endl;
7249 tout(cct) << relpath << std::endl;
7250 tout(cct) << mask << std::endl;
7251
7252 if (unmounting)
7253 return -ENOTCONN;
7254
7255 filepath path(relpath);
7256 InodeRef in;
7257 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7258 if (r < 0)
7259 return r;
7260 return _setattrx(in, stx, mask, perms);
7261 }
7262
7263 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7264 {
7265 std::lock_guard lock(client_lock);
7266 tout(cct) << __func__ << std::endl;
7267 tout(cct) << fd << std::endl;
7268 tout(cct) << mask << std::endl;
7269
7270 if (unmounting)
7271 return -ENOTCONN;
7272
7273 Fh *f = get_filehandle(fd);
7274 if (!f)
7275 return -EBADF;
7276 #if defined(__linux__) && defined(O_PATH)
7277 if (f->flags & O_PATH)
7278 return -EBADF;
7279 #endif
7280 return _setattr(f->inode, attr, mask, perms);
7281 }
7282
7283 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7284 {
7285 std::lock_guard lock(client_lock);
7286 tout(cct) << __func__ << std::endl;
7287 tout(cct) << fd << std::endl;
7288 tout(cct) << mask << std::endl;
7289
7290 if (unmounting)
7291 return -ENOTCONN;
7292
7293 Fh *f = get_filehandle(fd);
7294 if (!f)
7295 return -EBADF;
7296 #if defined(__linux__) && defined(O_PATH)
7297 if (f->flags & O_PATH)
7298 return -EBADF;
7299 #endif
7300 return _setattrx(f->inode, stx, mask, perms);
7301 }
7302
/* stat(2) analogue: resolve @relpath (following symlinks), refresh the
 * attributes selected by @mask from the MDS via _getattr(), then fill
 * @stbuf (and optionally @dirstat) from the in-memory inode.
 * Returns 0 on success or a negative errno. */
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // path_walk already requests @mask so the terminal lookup can fetch caps
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7328
7329 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7330 {
7331 unsigned mask = 0;
7332
7333 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7334 if (flags & AT_NO_ATTR_SYNC)
7335 goto out;
7336
7337 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7338 mask |= CEPH_CAP_PIN;
7339 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7340 mask |= CEPH_CAP_AUTH_SHARED;
7341 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7342 mask |= CEPH_CAP_LINK_SHARED;
7343 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7344 mask |= CEPH_CAP_FILE_SHARED;
7345 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7346 mask |= CEPH_CAP_XATTR_SHARED;
7347 out:
7348 return mask;
7349 }
7350
/* statx(2) analogue: resolve @relpath (honouring AT_SYMLINK_NOFOLLOW in
 * @flags), sync the caps implied by @want, and fill @stx.  The caller
 * learns which fields are valid from stx->stx_mask. */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  // convert statx want/flags into a CEPH_CAP_* mask (0 == serve from cache)
  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
7382
/* lstat(2) analogue: identical to stat() except a trailing symlink is
 * stat'ed itself rather than followed. */
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7409
/* Populate @st from the cached inode @in; optionally copy out the
 * directory fragstat/rstat.  Returns the caps currently issued on the
 * inode (callers may use this to judge attribute freshness). */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapid doubles as the device number so snapshots appear as
  // distinct filesystems
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // directories synthesize a POSIX-style link count
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();  // a dir can never have multiple hard links
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report the later of ctime/mtime as ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // dir "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7471
/* Populate @stx from the cached inode @in.  @mask is a CEPH_CAP_* mask
 * (as produced by statx_to_mask), NOT a CEPH_STATX_* mask: each cap bit
 * gates the fields it protects, and stx->stx_mask records which statx
 * fields were actually filled. */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;  // now the full mode, not just the type bits
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize a POSIX-style link count for directories
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();  // a dir can never have multiple hard links
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // dir "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;  // 512-byte blocks, rounded up
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report the later of ctime/mtime as ctime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7554
// Mark dentry @dn as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7559
7560 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7561 {
7562 std::lock_guard lock(client_lock);
7563 tout(cct) << __func__ << std::endl;
7564 tout(cct) << relpath << std::endl;
7565 tout(cct) << mode << std::endl;
7566
7567 if (unmounting)
7568 return -ENOTCONN;
7569
7570 filepath path(relpath);
7571 InodeRef in;
7572 int r = path_walk(path, &in, perms);
7573 if (r < 0)
7574 return r;
7575 struct stat attr;
7576 attr.st_mode = mode;
7577 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7578 }
7579
7580 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7581 {
7582 std::lock_guard lock(client_lock);
7583 tout(cct) << __func__ << std::endl;
7584 tout(cct) << fd << std::endl;
7585 tout(cct) << mode << std::endl;
7586
7587 if (unmounting)
7588 return -ENOTCONN;
7589
7590 Fh *f = get_filehandle(fd);
7591 if (!f)
7592 return -EBADF;
7593 #if defined(__linux__) && defined(O_PATH)
7594 if (f->flags & O_PATH)
7595 return -EBADF;
7596 #endif
7597 struct stat attr;
7598 attr.st_mode = mode;
7599 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7600 }
7601
7602 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7603 {
7604 std::lock_guard lock(client_lock);
7605 tout(cct) << __func__ << std::endl;
7606 tout(cct) << relpath << std::endl;
7607 tout(cct) << mode << std::endl;
7608
7609 if (unmounting)
7610 return -ENOTCONN;
7611
7612 filepath path(relpath);
7613 InodeRef in;
7614 // don't follow symlinks
7615 int r = path_walk(path, &in, perms, false);
7616 if (r < 0)
7617 return r;
7618 struct stat attr;
7619 attr.st_mode = mode;
7620 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7621 }
7622
7623 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7624 const UserPerm& perms)
7625 {
7626 std::lock_guard lock(client_lock);
7627 tout(cct) << __func__ << std::endl;
7628 tout(cct) << relpath << std::endl;
7629 tout(cct) << new_uid << std::endl;
7630 tout(cct) << new_gid << std::endl;
7631
7632 if (unmounting)
7633 return -ENOTCONN;
7634
7635 filepath path(relpath);
7636 InodeRef in;
7637 int r = path_walk(path, &in, perms);
7638 if (r < 0)
7639 return r;
7640 struct stat attr;
7641 attr.st_uid = new_uid;
7642 attr.st_gid = new_gid;
7643 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7644 }
7645
/* fchown(2) analogue: change owner/group via an open file descriptor.
 * A uid/gid of -1 means "leave unchanged" (its mask bit is dropped). */
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // an O_PATH handle cannot be used to modify the file
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  // only request the fields that are not the -1 "unchanged" sentinel
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
7672
/* lchown(2) analogue: like chown() but a trailing symlink is modified
 * itself rather than followed.  -1 means "leave unchanged". */
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  // only request the fields that are not the -1 "unchanged" sentinel
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
7699
// Helper: copy @atime/@mtime into the (platform-dependent) timestamp
// fields of @attr via the stat_set_* accessors.
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7709
7710 // for [l]utime() invoke the timeval variant as the timespec
7711 // variant are not yet implemented. for futime[s](), invoke
7712 // the timespec variant.
7713 int Client::utime(const char *relpath, struct utimbuf *buf,
7714 const UserPerm& perms)
7715 {
7716 struct timeval tv[2];
7717 tv[0].tv_sec = buf->actime;
7718 tv[0].tv_usec = 0;
7719 tv[1].tv_sec = buf->modtime;
7720 tv[1].tv_usec = 0;
7721
7722 return utimes(relpath, tv, perms);
7723 }
7724
7725 int Client::lutime(const char *relpath, struct utimbuf *buf,
7726 const UserPerm& perms)
7727 {
7728 struct timeval tv[2];
7729 tv[0].tv_sec = buf->actime;
7730 tv[0].tv_usec = 0;
7731 tv[1].tv_sec = buf->modtime;
7732 tv[1].tv_usec = 0;
7733
7734 return lutimes(relpath, tv, perms);
7735 }
7736
7737 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7738 {
7739 struct timespec ts[2];
7740 ts[0].tv_sec = buf->actime;
7741 ts[0].tv_nsec = 0;
7742 ts[1].tv_sec = buf->modtime;
7743 ts[1].tv_nsec = 0;
7744
7745 return futimens(fd, ts, perms);
7746 }
7747
/* utimes(2) analogue: set atime/mtime (microsecond resolution) on the
 * inode named by @relpath, following symlinks. */
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimes(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7774
/* lutimes(2) analogue: like utimes() but a trailing symlink is
 * modified itself rather than followed. */
int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // final 'false' = don't follow a trailing symlink
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimes(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7801
7802 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7803 {
7804 struct timespec ts[2];
7805 ts[0].tv_sec = times[0].tv_sec;
7806 ts[0].tv_nsec = times[0].tv_usec * 1000;
7807 ts[1].tv_sec = times[1].tv_sec;
7808 ts[1].tv_nsec = times[1].tv_usec * 1000;
7809
7810 return futimens(fd, ts, perms);
7811 }
7812
/* futimens(3) analogue: set atime/mtime (nanosecond resolution) via an
 * open file descriptor.  All other f*time* entry points funnel here. */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // an O_PATH handle cannot be used to modify the file
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  // times[0] is atime, times[1] is mtime (utimensat(2) convention)
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
7840
/* flock(2) analogue: advisory whole-file lock on an open fd.
 * @operation is a BSD flock()-style opcode (interpreted by _flock());
 * @owner identifies the lock holder. */
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
7858
/* opendir(3) analogue: resolve @relpath and hand back a directory
 * stream in *@dirpp.  With client_permissions enabled, read access to
 * the directory is checked first. */
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
7884
/* Internal opendir: allocate a dir_result_t for @in (must be a
 * directory) and register it in opened_dirs.  Caller holds client_lock. */
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
7894
7895
/* closedir(3) analogue: release a stream from opendir().  Always
 * succeeds. */
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7906
/* Internal closedir: drop the inode ref, free the readdir buffer,
 * deregister and delete @dirp.  Caller holds client_lock. */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7918
/* rewinddir(3) analogue: discard any buffered entries and reset the
 * stream to the beginning. */
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
7931
/* telldir(3) analogue: return the stream's current offset.
 * NOTE(review): unlike seekdir()/rewinddir() this reads dirp->offset
 * without taking client_lock — confirm callers serialize access. */
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7938
/* seekdir(3) analogue: reposition the stream to @offset (a value
 * previously returned by telldir), invalidating buffered entries and
 * readdir-cache bookkeeping as needed. */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // any seek breaks the invariants needed to populate/trust the
  // shared readdir cache for this traversal
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash order: only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag order: refetch when leaving the buffered frag or seeking
    // backward within it (or rewinding to the start)
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7972
7973
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Populate a struct dirent for the readdir callbacks.  @type is a
// stat-style mode (S_IF*), converted to DT_* via IFTODT; @next_off is
// the offset of the entry after this one.  Names longer than 255 bytes
// are silently truncated by the strncpy below.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7996
/* Advance @dirp past the buffered fragment: either mark the stream at
 * end (rightmost frag) or step to the next fragment, adjusting offset
 * bookkeeping for hash- vs frag-ordered traversal. */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag order: restart the name cursor at the top of the new frag
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
8022
/* Re-map the current fragment through the (possibly newer) dirfragtree;
 * if the tree has since split/merged, restart the cursor at the mapped
 * fragment.  No-op for hash-ordered traversal. */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8039
// Discard the buffered dentries fetched for @dirp's current fragment.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8045
/* Fetch one fragment's worth of entries for @dirp from the MDS
 * (READDIR, or LSSNAP under the snapdir) into dirp->buffer.  Retries
 * transparently on EAGAIN after re-choosing the fragment; on other
 * errors the stream is marked at end. */
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  // resume after the last returned name, or (hash order) at the hash
  // position encoded in the offset
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // frag map changed under us; re-map and retry
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8100
// Comparator for std::lower_bound over Dir::readdir_cache: orders a
// cached Dentry against a raw readdir offset via fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8106
/* Serve readdir entries for @dirp from the locally cached, complete and
 * ordered Dir, invoking @cb per entry (with client_lock dropped around
 * the callback).  Returns 0 at end-of-dir, a positive value if the
 * callback asked to stop, -EAGAIN if the cache became unusable
 * mid-walk (caller falls back to fetching from the MDS), or another
 * negative errno. */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at/after the stream offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // bail out if the dir lost I_COMPLETE/I_DIR_ORDERED while we walked
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop the lock for the callback; dn/dir state must be re-validated after
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8200
/* Core readdir driver: walk directory @d, calling @cb for each entry
 * (synthesizing "." and ".." first).  Entries are served from the
 * local cache when the dir is complete and ordered, otherwise fetched
 * fragment by fragment from the MDS.  @want/@flags are statx-style and
 * control attribute syncing; @getref hands the callback a referenced
 * Inode*.  Returns 0 at end of directory, >0 if the callback stopped
 * the walk, <0 on error. */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock across the user callback
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." (the dir itself when it has no parent)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable mid-walk; fall through to MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // full traversal completed without interference: mark the dir
    // cache-complete so future readdirs can be served locally
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
8397
8398
8399 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8400 {
8401 return readdirplus_r(d, de, 0, 0, 0, NULL);
8402 }
8403
8404 /*
8405 * readdirplus_r
8406 *
8407 * returns
8408 * 1 if we got a dirent
8409 * 0 for end of directory
8410 * <0 on error
8411 */
8412
/* Context for _readdir_single_dirent_cb: captures exactly one
 * directory entry from a readdir_r_cb() walk. */
struct single_readdir {
  struct dirent *de;       // caller-supplied dirent to fill
  struct ceph_statx *stx;  // optional statx to fill (may be NULL)
  Inode *inode;            // inode of the captured entry
  bool full;               // true once one entry has been captured
};
8419
8420 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8421 struct ceph_statx *stx, off_t off,
8422 Inode *in)
8423 {
8424 single_readdir *c = static_cast<single_readdir *>(p);
8425
8426 if (c->full)
8427 return -1; // already filled this dirent
8428
8429 *c->de = *de;
8430 if (c->stx)
8431 *c->stx = *stx;
8432 c->inode = in;
8433 c->full = true;
8434 return 1;
8435 }
8436
8437 struct dirent *Client::readdir(dir_result_t *d)
8438 {
8439 int ret;
8440 auto& de = d->de;
8441 single_readdir sr;
8442 sr.de = &de;
8443 sr.stx = NULL;
8444 sr.inode = NULL;
8445 sr.full = false;
8446
8447 // our callback fills the dirent and sets sr.full=true on first
8448 // call, and returns -1 the second time around.
8449 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8450 if (ret < -1) {
8451 errno = -ret; // this sucks.
8452 return (dirent *) NULL;
8453 }
8454 if (sr.full) {
8455 return &de;
8456 }
8457 return (dirent *) NULL;
8458 }
8459
8460 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8461 struct ceph_statx *stx, unsigned want,
8462 unsigned flags, Inode **out)
8463 {
8464 single_readdir sr;
8465 sr.de = de;
8466 sr.stx = stx;
8467 sr.inode = NULL;
8468 sr.full = false;
8469
8470 // our callback fills the dirent and sets sr.full=true on first
8471 // call, and returns -1 the second time around.
8472 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8473 if (r < -1)
8474 return r;
8475 if (out)
8476 *out = sr.inode;
8477 if (sr.full)
8478 return 1;
8479 return 0;
8480 }
8481
8482
8483 /* getdents */
/* Buffer-filling state shared between _getdents and
 * _readdir_getdent_cb. */
struct getdents_result {
  char *buf;     // caller-supplied output buffer
  int buflen;    // total buffer capacity in bytes
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole dirents; false: pack NUL-terminated names
};
8490
8491 static int _readdir_getdent_cb(void *p, struct dirent *de,
8492 struct ceph_statx *stx, off_t off, Inode *in)
8493 {
8494 struct getdents_result *c = static_cast<getdents_result *>(p);
8495
8496 int dlen;
8497 if (c->fullent)
8498 dlen = sizeof(*de);
8499 else
8500 dlen = strlen(de->d_name) + 1;
8501
8502 if (c->pos + dlen > c->buflen)
8503 return -1; // doesn't fit
8504
8505 if (c->fullent) {
8506 memcpy(c->buf + c->pos, de, sizeof(*de));
8507 } else {
8508 memcpy(c->buf + c->pos, de->d_name, dlen);
8509 }
8510 c->pos += dlen;
8511 return 0;
8512 }
8513
8514 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8515 {
8516 getdents_result gr;
8517 gr.buf = buf;
8518 gr.buflen = buflen;
8519 gr.fullent = fullent;
8520 gr.pos = 0;
8521
8522 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8523
8524 if (r < 0) { // some error
8525 if (r == -1) { // buffer ran out of space
8526 if (gr.pos) { // but we got some entries already!
8527 return gr.pos;
8528 } // or we need a larger buffer
8529 return -ERANGE;
8530 } else { // actual error, return it
8531 return r;
8532 }
8533 }
8534 return gr.pos;
8535 }
8536
8537
8538 /* getdir */
/* Accumulator for _getdir_cb: collects entry names and a count. */
struct getdir_result {
  list<string> *contents;  // entry names, in readdir order
  int num;                 // number of entries appended
};
8543
8544 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8545 {
8546 getdir_result *r = static_cast<getdir_result *>(p);
8547
8548 r->contents->push_back(de->d_name);
8549 r->num++;
8550 return 0;
8551 }
8552
8553 int Client::getdir(const char *relpath, list<string>& contents,
8554 const UserPerm& perms)
8555 {
8556 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8557 {
8558 std::lock_guard lock(client_lock);
8559 tout(cct) << "getdir" << std::endl;
8560 tout(cct) << relpath << std::endl;
8561 }
8562
8563 dir_result_t *d;
8564 int r = opendir(relpath, &d, perms);
8565 if (r < 0)
8566 return r;
8567
8568 getdir_result gr;
8569 gr.contents = &contents;
8570 gr.num = 0;
8571 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8572
8573 closedir(d);
8574
8575 if (r < 0)
8576 return r;
8577 return gr.num;
8578 }
8579
8580
8581 /****** file i/o **********/
/**
 * Open (and optionally create) a file by path.
 *
 * Translates POSIX flags to wire flags, walks the path, honors
 * O_CREAT/O_EXCL/O_NOFOLLOW (and Linux O_PATH) semantics, performs
 * permission checks when client_permissions is enabled, and finally
 * allocates an integer file descriptor mapped to the new Fh.
 *
 * Returns the new fd on success, or a negative errno.
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  int cflags = ceph_flags_sys2wire(flags);

  ldout(cct, 3) << "open enter(" << relpath << ", " << cflags << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << cflags << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  int r = path_walk(path, &in, perms, followsym, mask);

  // O_CREAT|O_EXCL demands the path not already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // target missing and creation requested: walk to the parent
    // directory, check create permission, then create the file
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create() may already have produced an Fh; otherwise open now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << cflags << ") = " << r << dendl;
  return r;
}
8670
/**
 * Convenience overload of open() that uses the default file striping
 * parameters (stripe unit/count, object size, data pool).
 */
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8676
8677 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8678 const UserPerm& perms)
8679 {
8680 std::lock_guard lock(client_lock);
8681 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8682
8683 if (unmounting)
8684 return -ENOTCONN;
8685
8686 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8687 filepath path(ino);
8688 req->set_filepath(path);
8689
8690 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8691 char f[30];
8692 sprintf(f, "%u", h);
8693 filepath path2(dirino);
8694 path2.push_dentry(string(f));
8695 req->set_filepath2(path2);
8696
8697 int r = make_request(req, perms, NULL, NULL,
8698 rand() % mdsmap->get_num_in_mds());
8699 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8700 return r;
8701 }
8702
8703
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on the
 * resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // ask a random active MDS rank to look the inode up by number
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // on success the inode must now be in inode_map; hand the caller
    // a referenced pointer
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}
8733
/**
 * Locking wrapper around _lookup_ino(): look up an inode by number,
 * optionally returning a referenced Inode*.
 */
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8739
8740 /**
8741 * Find the parent inode of `ino` and insert it into
8742 * our cache. Conditionally also set `parent` to a referenced
8743 * Inode* if caller provides non-NULL value.
8744 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  // ask a random active MDS rank for the parent of this inode
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      // take a reference before handing the parent out
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8768
8769 /**
8770 * Populate the parent dentry for `ino`, provided it is
8771 * a child of `parent`.
8772 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  // LOOKUPNAME: ask the MDS to populate the dentry linking `ino`
  // under `parent` (filepath2 carries the parent)
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8790
/**
 * Locking wrapper around _lookup_name(): populate the dentry linking
 * `ino` under `parent`.
 */
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8796
8797 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8798 {
8799 ceph_assert(in);
8800 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
8801
8802 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
8803
8804 if (in->snapid != CEPH_NOSNAP) {
8805 in->snap_cap_refs++;
8806 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8807 << ccap_string(in->caps_issued()) << dendl;
8808 }
8809
8810 const auto& conf = cct->_conf;
8811 f->readahead.set_trigger_requests(1);
8812 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8813 uint64_t max_readahead = Readahead::NO_LIMIT;
8814 if (conf->client_readahead_max_bytes) {
8815 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8816 }
8817 if (conf->client_readahead_max_periods) {
8818 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8819 }
8820 f->readahead.set_max_readahead_size(max_readahead);
8821 vector<uint64_t> alignments;
8822 alignments.push_back(in->layout.get_period());
8823 alignments.push_back(in->layout.stripe_unit);
8824 f->readahead.set_alignments(alignments);
8825
8826 return f;
8827 }
8828
/**
 * Tear down a file handle: drop its delegation, release open/cap
 * references, flush dirty data if this was the last opener for the
 * mode, release file locks, and surface any asynchronous write error
 * recorded on the handle.
 *
 * Returns 0, or the stored async error (negative errno).
 */
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      // last open ref for this mode: flush dirty data and re-evaluate
      // which caps we still want
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes only track a snap cap refcount
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8863
8864 void Client::_put_fh(Fh *f)
8865 {
8866 int left = f->put();
8867 if (!left) {
8868 delete f;
8869 }
8870 }
8871
/**
 * Open an inode: take an open ref for the computed ceph open mode,
 * and either satisfy the open from already-issued caps or perform an
 * MDS open request. On success, wrap the result in a new Fh via
 * _create_fh() (if fhp is non-NULL).
 *
 * Returns 0 or a negative errno; on failure the open ref is dropped.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only: refuse any write-implying flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this mode wants; no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    // need an MDS open (missing caps, or O_TRUNC must be performed)
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;  // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary stack Fh, used only to wait on the needed caps
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8952
/**
 * Re-establish file caps for an inode by replaying an MDS open with
 * flags derived from the caps the inode's open modes currently want.
 * If existing caps suffice, only nudges cap wants via check_caps().
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    // no re-open needed; just update wanted caps with the MDS
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // derive open flags from the wanted file caps
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8990
/**
 * POSIX close: release the handle behind `fd` (which may flush dirty
 * data and surface an async write error), then retire the fd number.
 *
 * Returns 0 or the handle's stored async error (negative errno).
 */
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  // release the Fh first, then make the fd number reusable
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
9010
9011
9012 // ------------
9013 // read, write
9014
/**
 * POSIX lseek on an integer fd: validate the handle, then defer to
 * _lseek() under the client lock. Returns the new position or a
 * negative errno.
 */
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles do not permit I/O-style operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
9035
/**
 * Core seek implementation. Whence values that depend on the current
 * file size (SEEK_END, SEEK_DATA, SEEK_HOLE) first refresh the size
 * from the MDS. SEEK_DATA/SEEK_HOLE report the entire file as one
 * data extent with the only hole at EOF.
 *
 * Returns the new file position or a negative errno.
 */
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // these whence modes need an up-to-date file size
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // the whole file is reported as data
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -ENXIO;
    // the only reported "hole" is at EOF
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -EINVAL;
  }

  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9109
9110
/**
 * Serialize access to f->pos. If the position is already locked (or
 * other waiters are queued), enqueue a condvar and block until we are
 * the front of the FIFO and the lock is free. Must be called with
 * client_lock held; client_lock is released while waiting.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // adopt the already-held client_lock so cond.wait() can drop and
    // re-take it
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // proceed only when unlocked AND we are the oldest waiter
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // hand client_lock ownership back to the caller without unlocking
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9131
/**
 * Release the f->pos lock taken by lock_fh_pos() and wake the oldest
 * waiter, if any. Caller must hold client_lock.
 */
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
9144
/**
 * Migrate a file's inline data out to its first RADOS object.
 *
 * Issues two object mutations: one to (non-exclusively) create the
 * object, then a second, guarded by a cmpxattr on "inline_version",
 * that writes the inline payload and records the version. `onfinish`
 * completes when the second mutation does (or immediately with 0 if
 * there is no inline data).
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // name of the file's first object: <ino in hex>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // guard the write on the object's recorded inline_version so a
  // stale migration cannot clobber newer data
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9189
9190 //
9191
9192 // blocking osd interface
9193
/**
 * POSIX pread-like entry point: read up to `size` bytes at `offset`
 * (a negative offset means the current file position — see _read())
 * into `buf`. Returns bytes read or a negative errno.
 */
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::unique_lock lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out to the caller's buffer without holding client_lock
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9224
/**
 * Vectored positional read. Returns bytes read or a negative errno.
 */
int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
{
  // a negative iovec count is invalid
  if (iovcnt < 0)
    return -EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
}
9231
/**
 * Core read path. Handles:
 *  - implicit file-position reads (offset < 0 uses, and on success
 *    advances, f->pos under the fh pos lock),
 *  - inline data (served from in->inline_data when we hold Fc,
 *    otherwise migrated out first via uninline_data()),
 *  - cached reads through the ObjectCacher vs. sync OSD reads,
 *  - retry of short sync reads after re-verifying the file size.
 *
 * Returns bytes read or a negative errno. Caller holds client_lock;
 * it is dropped while waiting on I/O.
 */
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // read from (and later advance) the shared file position
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // fetch inline-data state before deciding how to serve the read
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    // O_DIRECT bypasses the cache even if we hold cache caps
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // without Fc, push the inline data out to RADOS and fall
      // through to a normal read; completion is awaited in `done`
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // serve the read straight from the inline payload, zero-filling
      // between the inline length and EOF
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        rc = endoff - offset;
      } else {
        rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // buffered read through the object cacher
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps and re-verify the size before deciding
      // whether this was really EOF
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // wait for the inline-data migration issued above to finish
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
9379
// Pin the Fh and mark a readahead as pending for the lifetime of
// this completion context.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
  client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
9385
Client::C_Readahead::~C_Readahead() {
  // balance the pending count and the Fh reference taken in the ctor
  f->readahead.dec_pending();
  client->_put_fh(f);
}
9390
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // drop the cap refs taken when the readahead was issued
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9395
/**
 * Buffered read through the ObjectCacher, clamped to the current file
 * size, followed by opportunistic readahead. Blocks (dropping
 * client_lock) when the requested range is not yet cached.
 *
 * Returns bytes read or a negative errno.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // cache miss: wait for the cacher to fill, pinning Fc meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight: C_Readahead::finish releases these refs
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9450
/**
 * Synchronous read directly through the Filer/OSDs, bypassing the
 * object cacher. Loops until the request is satisfied, zero-filling
 * holes up to the known EOF; on an (apparent) EOF short read it sets
 * *checkeof so the caller can re-verify the file size and retry.
 *
 * Returns bytes placed in *bl or a negative errno.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // drop the client lock while the OSD read is in flight
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // reached apparent EOF; let the caller re-check the size
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9510
9511
9512 /*
9513 * we keep count of uncommitted sync writes on the inode, so that
9514 * fsync can DDRT.
9515 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  // the sync write pinned the buffer cap; release it now that the
  // write has committed
  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
9529
9530 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9531 {
9532 std::lock_guard lock(client_lock);
9533 tout(cct) << "write" << std::endl;
9534 tout(cct) << fd << std::endl;
9535 tout(cct) << size << std::endl;
9536 tout(cct) << offset << std::endl;
9537
9538 if (unmounting)
9539 return -ENOTCONN;
9540
9541 Fh *fh = get_filehandle(fd);
9542 if (!fh)
9543 return -EBADF;
9544 #if defined(__linux__) && defined(O_PATH)
9545 if (fh->flags & O_PATH)
9546 return -EBADF;
9547 #endif
9548 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9549 size = std::min(size, (loff_t)INT_MAX);
9550 int r = _write(fh, offset, size, buf, NULL, false);
9551 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9552 return r;
9553 }
9554
9555 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9556 {
9557 if (iovcnt < 0)
9558 return -EINVAL;
9559 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9560 }
9561
9562 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
9563 unsigned iovcnt, int64_t offset, bool write,
9564 bool clamp_to_int)
9565 {
9566 #if defined(__linux__) && defined(O_PATH)
9567 if (fh->flags & O_PATH)
9568 return -EBADF;
9569 #endif
9570 loff_t totallen = 0;
9571 for (unsigned i = 0; i < iovcnt; i++) {
9572 totallen += iov[i].iov_len;
9573 }
9574
9575 /*
9576 * Some of the API functions take 64-bit size values, but only return
9577 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9578 * we don't do I/Os larger than the values we can return.
9579 */
9580 if (clamp_to_int) {
9581 totallen = std::min(totallen, (loff_t)INT_MAX);
9582 }
9583 if (write) {
9584 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9585 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9586 return w;
9587 } else {
9588 bufferlist bl;
9589 int64_t r = _read(fh, offset, totallen, &bl);
9590 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
9591 if (r <= 0)
9592 return r;
9593
9594 auto iter = bl.cbegin();
9595 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9596 /*
9597 * This piece of code aims to handle the case that bufferlist does not have enough data
9598 * to fill in the iov
9599 */
9600 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
9601 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
9602 resid -= round_size;
9603 /* iter is self-updating */
9604 }
9605 return r;
9606 }
9607 }
9608
9609 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9610 {
9611 std::lock_guard lock(client_lock);
9612 tout(cct) << fd << std::endl;
9613 tout(cct) << offset << std::endl;
9614
9615 if (unmounting)
9616 return -ENOTCONN;
9617
9618 Fh *fh = get_filehandle(fd);
9619 if (!fh)
9620 return -EBADF;
9621 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
9622 }
9623
/*
 * Core write path.  Writes 'size' bytes at 'offset' (or at the fd position
 * when offset < 0, honoring O_APPEND) taken from either 'buf' or the
 * iovec array 'iov'/'iovcnt'.
 *
 * Chooses between three paths: updating inline data in place, a buffered
 * write through the ObjectCacher (when client_oc is enabled and the
 * BUFFER/LAZYIO caps are held), or a synchronous Filer write.  On success
 * it updates size/mtime/ctime and marks FILE_WR caps dirty.
 *
 * Returns bytes written or a negative error code.  Called with client_lock
 * held; the lock is dropped while waiting for sync writes and for inline
 * data uninlining.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	                const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // remember where the fd position should land after a successful write
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // make sure we know whether the file has inline data before proceeding
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // blocks until FILE_WR (and AUTH_SHARED, for the sguid check below) is held
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the cache even when the caps would allow buffering
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // result would no longer fit inline (or we can't buffer) -- kick off
      // uninlining; completion is awaited at 'done'
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // small write into inline data: splice the new bytes in locally
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    // make sure cached data doesn't shadow the direct write
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // drop client_lock while blocked on the OSDs
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  // wait for any uninline started above; success (or ECANCELED, meaning
  // someone else uninlined) lets us drop the local inline copy
  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  // matches the FILE_WR reference taken by get_caps()
  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9856
9857 int Client::_flush(Fh *f)
9858 {
9859 Inode *in = f->inode.get();
9860 int err = f->take_async_err();
9861 if (err != 0) {
9862 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9863 << cpp_strerror(err) << dendl;
9864 } else {
9865 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9866 }
9867
9868 return err;
9869 }
9870
9871 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9872 {
9873 struct ceph_statx stx;
9874 stx.stx_size = length;
9875 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9876 }
9877
9878 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9879 {
9880 std::lock_guard lock(client_lock);
9881 tout(cct) << __func__ << std::endl;
9882 tout(cct) << fd << std::endl;
9883 tout(cct) << length << std::endl;
9884
9885 if (unmounting)
9886 return -ENOTCONN;
9887
9888 Fh *f = get_filehandle(fd);
9889 if (!f)
9890 return -EBADF;
9891 #if defined(__linux__) && defined(O_PATH)
9892 if (f->flags & O_PATH)
9893 return -EBADF;
9894 #endif
9895 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9896 return -EBADF;
9897 struct stat attr;
9898 attr.st_size = length;
9899 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9900 }
9901
9902 int Client::fsync(int fd, bool syncdataonly)
9903 {
9904 std::lock_guard lock(client_lock);
9905 tout(cct) << "fsync" << std::endl;
9906 tout(cct) << fd << std::endl;
9907 tout(cct) << syncdataonly << std::endl;
9908
9909 if (unmounting)
9910 return -ENOTCONN;
9911
9912 Fh *f = get_filehandle(fd);
9913 if (!f)
9914 return -EBADF;
9915 #if defined(__linux__) && defined(O_PATH)
9916 if (f->flags & O_PATH)
9917 return -EBADF;
9918 #endif
9919 int r = _fsync(f, syncdataonly);
9920 if (r == 0) {
9921 // The IOs in this fsync were okay, but maybe something happened
9922 // in the background that we shoudl be reporting?
9923 r = f->take_async_err();
9924 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9925 << ") = 0, async_err = " << r << dendl;
9926 } else {
9927 // Assume that an error we encountered during fsync, even reported
9928 // synchronously, would also have applied the error to the Fh, and we
9929 // should clear it here to avoid returning the same error again on next
9930 // call.
9931 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9932 << r << dendl;
9933 f->take_async_err();
9934 }
9935 return r;
9936 }
9937
/*
 * Flush an inode's dirty state to stable storage.
 *
 * With client_oc enabled, kicks a cache flush and waits for its real
 * completion; otherwise it waits for outstanding FILE_BUFFER references to
 * drain.  Unless syncdataonly is set, dirty caps are flushed to the MDS
 * and any unsafe (unacked) metadata requests on the inode are waited out.
 *
 * Returns 0 or the (negative) flush error.  Called with client_lock held;
 * the lock is dropped while waiting on the cache flush.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  // flush dirty metadata (caps) unless only data was requested
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  // wait until the newest unsafe metadata request on this inode is safe
  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    // data flushed fine; now wait for the cap flush (if any) to be acked
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10004
10005 int Client::_fsync(Fh *f, bool syncdataonly)
10006 {
10007 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10008 return _fsync(f->inode.get(), syncdataonly);
10009 }
10010
10011 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10012 {
10013 std::lock_guard lock(client_lock);
10014 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10015 tout(cct) << fd << std::endl;
10016
10017 if (unmounting)
10018 return -ENOTCONN;
10019
10020 Fh *f = get_filehandle(fd);
10021 if (!f)
10022 return -EBADF;
10023 int r = _getattr(f->inode, mask, perms);
10024 if (r < 0)
10025 return r;
10026 fill_stat(f->inode, stbuf, NULL);
10027 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10028 return r;
10029 }
10030
10031 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10032 unsigned int want, unsigned int flags)
10033 {
10034 std::lock_guard lock(client_lock);
10035 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10036 tout(cct) << fd << std::endl;
10037
10038 if (unmounting)
10039 return -ENOTCONN;
10040
10041 Fh *f = get_filehandle(fd);
10042 if (!f)
10043 return -EBADF;
10044
10045 unsigned mask = statx_to_mask(flags, want);
10046
10047 int r = 0;
10048 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10049 r = _getattr(f->inode, mask, perms);
10050 if (r < 0) {
10051 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10052 return r;
10053 }
10054 }
10055
10056 fill_statx(f->inode, mask, stx);
10057 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10058 return r;
10059 }
10060
10061 // not written yet, but i want to link!
10062
10063 int Client::chdir(const char *relpath, std::string &new_cwd,
10064 const UserPerm& perms)
10065 {
10066 std::lock_guard lock(client_lock);
10067 tout(cct) << "chdir" << std::endl;
10068 tout(cct) << relpath << std::endl;
10069
10070 if (unmounting)
10071 return -ENOTCONN;
10072
10073 filepath path(relpath);
10074 InodeRef in;
10075 int r = path_walk(path, &in, perms);
10076 if (r < 0)
10077 return r;
10078
10079 if (!(in.get()->is_dir()))
10080 return -ENOTDIR;
10081
10082 if (cwd != in)
10083 cwd.swap(in);
10084 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10085
10086 _getcwd(new_cwd, perms);
10087 return 0;
10088 }
10089
/*
 * Build the absolute path of the current working directory into 'dir' by
 * walking dentries from cwd up to root.
 *
 * If some ancestor has no cached parent dentry, a LOOKUPNAME request is
 * issued to the MDS and the walk restarts from cwd.  If the cwd or an
 * ancestor has been unlinked (no dentries at all), 'dir' is left
 * unmodified.  Caller holds client_lock.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // The cwd or an ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    // prepend this component and step up to the parent directory
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10129
10130 void Client::getcwd(string& dir, const UserPerm& perms)
10131 {
10132 std::lock_guard l(client_lock);
10133 if (!unmounting)
10134 _getcwd(dir, perms);
10135 }
10136
10137 int Client::statfs(const char *path, struct statvfs *stbuf,
10138 const UserPerm& perms)
10139 {
10140 std::lock_guard l(client_lock);
10141 tout(cct) << __func__ << std::endl;
10142 unsigned long int total_files_on_fs;
10143
10144 if (unmounting)
10145 return -ENOTCONN;
10146
10147 ceph_statfs stats;
10148 C_SaferCond cond;
10149
10150 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10151 if (data_pools.size() == 1) {
10152 objecter->get_fs_stats(stats, data_pools[0], &cond);
10153 } else {
10154 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10155 }
10156
10157 client_lock.unlock();
10158 int rval = cond.wait();
10159 assert(root);
10160 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10161 client_lock.lock();
10162
10163 if (rval < 0) {
10164 ldout(cct, 1) << "underlying call to statfs returned error: "
10165 << cpp_strerror(rval)
10166 << dendl;
10167 return rval;
10168 }
10169
10170 memset(stbuf, 0, sizeof(*stbuf));
10171
10172 /*
10173 * we're going to set a block size of 4MB so we can represent larger
10174 * FSes without overflowing. Additionally convert the space
10175 * measurements from KB to bytes while making them in terms of
10176 * blocks. We use 4MB only because it is big enough, and because it
10177 * actually *is* the (ceph) default block size.
10178 */
10179 const int CEPH_BLOCK_SHIFT = 22;
10180 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10181 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10182 stbuf->f_files = total_files_on_fs;
10183 stbuf->f_ffree = 0;
10184 stbuf->f_favail = -1;
10185 stbuf->f_fsid = -1; // ??
10186 stbuf->f_flag = 0; // ??
10187 stbuf->f_namemax = NAME_MAX;
10188
10189 // Usually quota_root will == root_ancestor, but if the mount root has no
10190 // quota but we can see a parent of it that does have a quota, we'll
10191 // respect that one instead.
10192 ceph_assert(root != nullptr);
10193 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10194
10195 // get_quota_root should always give us something
10196 // because client quotas are always enabled
10197 ceph_assert(quota_root != nullptr);
10198
10199 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10200
10201 // Skip the getattr if any sessions are stale, as we don't want to
10202 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10203 // is unhealthy.
10204 if (!_any_stale_sessions()) {
10205 int r = _getattr(quota_root, 0, perms, true);
10206 if (r != 0) {
10207 // Ignore return value: error getting latest inode metadata is not a good
10208 // reason to break "df".
10209 lderr(cct) << "Error in getattr on quota root 0x"
10210 << std::hex << quota_root->ino << std::dec
10211 << " statfs result may be outdated" << dendl;
10212 }
10213 }
10214
10215 // Special case: if there is a size quota set on the Inode acting
10216 // as the root for this client mount, then report the quota status
10217 // as the filesystem statistics.
10218 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10219 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10220 // It is possible for a quota to be exceeded: arithmetic here must
10221 // handle case where used > total.
10222 const fsblkcnt_t free = total > used ? total - used : 0;
10223
10224 stbuf->f_blocks = total;
10225 stbuf->f_bfree = free;
10226 stbuf->f_bavail = free;
10227 } else {
10228 // General case: report the cluster statistics returned from RADOS. Because
10229 // multiple pools may be used without one filesystem namespace via
10230 // layouts, this is the most correct thing we can do.
10231 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10232 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10233 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10234 }
10235
10236 return rval;
10237 }
10238
/*
 * Issue a file-lock operation (GETFILELOCK or SETFILELOCK, fcntl- or
 * flock-style per 'lock_type') to the MDS.
 *
 * 'sleep' requests blocking behavior; while blocked, an interrupt callback
 * (if registered) can cancel the request via _interrupt_filelock().  On a
 * successful SETFILELOCK the local lock state on the inode (and, unless
 * 'removing', on the Fh) is updated to mirror the MDS.  On GETFILELOCK the
 * conflicting lock (or F_UNLCK) is decoded back into *fl.
 *
 * Returns 0 or a negative error code.  Caller holds client_lock.
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // lock state on this inode is untrusted (e.g. after session loss)
  if (in->flags & I_ERROR_FILELOCK)
    return -EIO;

  // translate the POSIX lock type to the MDS lock command
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // only a blocking SETFILELOCK of an actual lock can sleep
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    // drop the extra reference taken for the interrupt callback
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting lock (or F_UNLCK) back into *fl
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the successful change into the inode's local lock state
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // also track the lock on the Fh so it can be released at close time,
      // unless this call *is* the release
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
10352
/*
 * Interrupt a blocked file-lock request 'req'.
 *
 * Marks the request aborted with -EINTR so it won't be re-sent; if it has
 * already been sent to an MDS, also issues a companion SETFILELOCK with
 * the corresponding *_INTR rule to cancel the pending lock server-side.
 *
 * Returns 0 or the result of sending the interrupt request.
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // map the original lock rule to its interrupt counterpart
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // the interrupt request mirrors the original, but unlocks under the
  // *_INTR rule
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // act with the credentials of the original locker
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
10385
10386 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10387 {
10388 if (!in->fcntl_locks && !in->flock_locks)
10389 return;
10390
10391 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10392 encode(nr_fcntl_locks, bl);
10393 if (nr_fcntl_locks) {
10394 auto &lock_state = in->fcntl_locks;
10395 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10396 p != lock_state->held_locks.end();
10397 ++p)
10398 encode(p->second, bl);
10399 }
10400
10401 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10402 encode(nr_flock_locks, bl);
10403 if (nr_flock_locks) {
10404 auto &lock_state = in->flock_locks;
10405 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10406 p != lock_state->held_locks.end();
10407 ++p)
10408 encode(p->second, bl);
10409 }
10410
10411 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10412 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10413 }
10414
/*
 * Drop all fcntl and flock locks recorded on an Fh when the handle is
 * closed.
 *
 * The Fh-side lock state is always discarded.  For each lock, unless the
 * inode is in I_ERROR_FILELOCK state (lock state untrusted, e.g. after
 * session loss), an unlock is also sent to the MDS; in the error case the
 * lock is only removed from the inode's local state.
 */
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // q = p++ so removal of *q can't invalidate the loop iterator
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // once no locks remain anywhere on the inode, the error state can clear
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    // removing=true: tell the MDS, but don't re-record state on the Fh
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
10473
10474 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10475 ceph_lock_state_t *lock_state)
10476 {
10477 int lock_cmd;
10478 if (F_RDLCK == fl->l_type)
10479 lock_cmd = CEPH_LOCK_SHARED;
10480 else if (F_WRLCK == fl->l_type)
10481 lock_cmd = CEPH_LOCK_EXCL;
10482 else
10483 lock_cmd = CEPH_LOCK_UNLOCK;;
10484
10485 ceph_filelock filelock;
10486 filelock.start = fl->l_start;
10487 filelock.length = fl->l_len;
10488 filelock.client = 0;
10489 // see comment in _do_filelock()
10490 filelock.owner = owner | (1ULL << 63);
10491 filelock.pid = fl->l_pid;
10492 filelock.type = lock_cmd;
10493
10494 if (filelock.type == CEPH_LOCK_UNLOCK) {
10495 list<ceph_filelock> activated_locks;
10496 lock_state->remove_lock(filelock, activated_locks);
10497 } else {
10498 bool r = lock_state->add_lock(filelock, false, false, NULL);
10499 ceph_assert(r);
10500 }
10501 }
10502
10503 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10504 {
10505 Inode *in = fh->inode.get();
10506 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10507 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10508 return ret;
10509 }
10510
10511 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10512 {
10513 Inode *in = fh->inode.get();
10514 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10515 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10516 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10517 return ret;
10518 }
10519
10520 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10521 {
10522 Inode *in = fh->inode.get();
10523 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10524
10525 int sleep = !(cmd & LOCK_NB);
10526 cmd &= ~LOCK_NB;
10527
10528 int type;
10529 switch (cmd) {
10530 case LOCK_SH:
10531 type = F_RDLCK;
10532 break;
10533 case LOCK_EX:
10534 type = F_WRLCK;
10535 break;
10536 case LOCK_UN:
10537 type = F_UNLCK;
10538 break;
10539 default:
10540 return -EINVAL;
10541 }
10542
10543 struct flock fl;
10544 memset(&fl, 0, sizeof(fl));
10545 fl.l_type = type;
10546 fl.l_whence = SEEK_SET;
10547
10548 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10549 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10550 return ret;
10551 }
10552
10553 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10554 {
10555 /* Since the only thing this does is wrap a call to statfs, and
10556 statfs takes a lock, it doesn't seem we have a need to split it
10557 out. */
10558 return statfs(0, stbuf, perms);
10559 }
10560
/*
 * Install the caller-supplied callback table (used by the FUSE /
 * libcephfs layers) and start the finisher threads that deliver those
 * callbacks asynchronously.
 *
 * Each finisher is started only when its corresponding callback was
 * actually provided, so unused machinery stays idle.  No finisher is
 * started for umask_cb.
 */
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
10596
10597 int Client::test_dentry_handling(bool can_invalidate)
10598 {
10599 int r = 0;
10600
10601 can_invalidate_dentries = can_invalidate;
10602
10603 if (can_invalidate_dentries) {
10604 ceph_assert(dentry_invalidate_cb);
10605 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10606 r = 0;
10607 } else {
10608 ceph_assert(remount_cb);
10609 ldout(cct, 1) << "using remount_cb" << dendl;
10610 r = _do_remount(false);
10611 }
10612
10613 return r;
10614 }
10615
/*
 * Flush all dirty file data and caps to the cluster and wait for them
 * to be acknowledged.
 *
 * Called with client_lock held; the lock is dropped only while waiting
 * on the object-cacher data flush, then reacquired.  Always returns 0.
 */
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // capture the tid now so later, unrelated flushes don't delay us
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // drop client_lock while blocking so message processing (which the
    // flush completion depends on) can make progress
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
10646
10647 int Client::sync_fs()
10648 {
10649 std::lock_guard l(client_lock);
10650
10651 if (unmounting)
10652 return -ENOTCONN;
10653
10654 return _sync_fs();
10655 }
10656
10657 int64_t Client::drop_caches()
10658 {
10659 std::lock_guard l(client_lock);
10660 return objectcacher->release_all();
10661 }
10662
10663 int Client::_lazyio(Fh *fh, int enable)
10664 {
10665 Inode *in = fh->inode.get();
10666 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10667
10668 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10669 return 0;
10670
10671 int orig_mode = fh->mode;
10672 if (enable) {
10673 fh->mode |= CEPH_FILE_MODE_LAZY;
10674 in->get_open_ref(fh->mode);
10675 in->put_open_ref(orig_mode);
10676 check_caps(in, CHECK_CAPS_NODELAY);
10677 } else {
10678 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10679 in->get_open_ref(fh->mode);
10680 in->put_open_ref(orig_mode);
10681 check_caps(in, 0);
10682 }
10683
10684 return 0;
10685 }
10686
10687 int Client::lazyio(int fd, int enable)
10688 {
10689 std::lock_guard l(client_lock);
10690 Fh *f = get_filehandle(fd);
10691 if (!f)
10692 return -EBADF;
10693
10694 return _lazyio(f, enable);
10695 }
10696
10697 int Client::ll_lazyio(Fh *fh, int enable)
10698 {
10699 std::lock_guard lock(client_lock);
10700 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10701 tout(cct) << __func__ << std::endl;
10702
10703 return _lazyio(fh, enable);
10704 }
10705
10706 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
10707 {
10708 std::lock_guard l(client_lock);
10709 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
10710 << ", " << offset << ", " << count << ")" << dendl;
10711
10712 Fh *f = get_filehandle(fd);
10713 if (!f)
10714 return -EBADF;
10715
10716 // for now
10717 _fsync(f, true);
10718
10719 return 0;
10720 }
10721
10722 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10723 {
10724 std::lock_guard l(client_lock);
10725 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10726 << ", " << offset << ", " << count << ")" << dendl;
10727
10728 Fh *f = get_filehandle(fd);
10729 if (!f)
10730 return -EBADF;
10731 Inode *in = f->inode.get();
10732
10733 _fsync(f, true);
10734 if (_release(in)) {
10735 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10736 if (r < 0)
10737 return r;
10738 }
10739 return 0;
10740 }
10741
10742
10743 // =============================
10744 // snaps
10745
10746 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10747 {
10748 std::lock_guard l(client_lock);
10749
10750 if (unmounting)
10751 return -ENOTCONN;
10752
10753 filepath path(relpath);
10754 InodeRef in;
10755 int r = path_walk(path, &in, perm);
10756 if (r < 0)
10757 return r;
10758 if (cct->_conf->client_permissions) {
10759 r = may_create(in.get(), perm);
10760 if (r < 0)
10761 return r;
10762 }
10763 Inode *snapdir = open_snapdir(in.get());
10764 return _mkdir(snapdir, name, 0, perm);
10765 }
10766
10767 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10768 {
10769 std::lock_guard l(client_lock);
10770
10771 if (unmounting)
10772 return -ENOTCONN;
10773
10774 filepath path(relpath);
10775 InodeRef in;
10776 int r = path_walk(path, &in, perms);
10777 if (r < 0)
10778 return r;
10779 if (cct->_conf->client_permissions) {
10780 r = may_delete(in.get(), NULL, perms);
10781 if (r < 0)
10782 return r;
10783 }
10784 Inode *snapdir = open_snapdir(in.get());
10785 return _rmdir(snapdir, name, perms);
10786 }
10787
10788 // =============================
10789 // expose caps
10790
10791 int Client::get_caps_issued(int fd) {
10792
10793 std::lock_guard lock(client_lock);
10794
10795 if (unmounting)
10796 return -ENOTCONN;
10797
10798 Fh *f = get_filehandle(fd);
10799 if (!f)
10800 return -EBADF;
10801
10802 return f->inode->caps_issued();
10803 }
10804
10805 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10806 {
10807 std::lock_guard lock(client_lock);
10808
10809 if (unmounting)
10810 return -ENOTCONN;
10811
10812 filepath p(path);
10813 InodeRef in;
10814 int r = path_walk(p, &in, perms, true);
10815 if (r < 0)
10816 return r;
10817 return in->caps_issued();
10818 }
10819
10820 // =========================================
10821 // low level
10822
/*
 * Return the ".snap" pseudo-directory inode for 'diri', creating and
 * caching it in inode_map on first use.
 *
 * The snapdir shares the parent's ino but is keyed with snapid
 * CEPH_SNAPDIR, and mirrors most of the parent's attributes.
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Clone the parent's identity/attributes into the pseudo-inode.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->nlink = 1;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->atime = diri->atime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    // mark the parent so it knows a snapdir inode exists for it
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10856
/*
 * Low-level lookup of 'name' under 'parent'.
 *
 * On success fills *attr, takes an ll reference on the result (released
 * later via ll_forget/ll_put) and stores it in *out.  On failure
 * attr->st_ino is zeroed, *out is NULL, and the error is returned.
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    // skip the permission check for "." and ".."
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // on error 'in' is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
10898
/*
 * Look up an inode by bare ino and make sure it is linked into the
 * dentry cache (so a path can be reconstructed for it).
 *
 * Three steps: (1) fetch the inode itself, (2) fetch its parent,
 * (3) look up the inode's name under that parent.  The inode reference
 * acquired in step 1 is handed to the caller on success; on any later
 * failure it is dropped again via _ll_forget().
 */
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  if (unmounting)
    return -ENOTCONN;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    // undo the ref taken in step 1
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // the parent ref was only needed for the name lookup
  _ll_forget(parent, 1);
  return 0;
}
10950
/*
 * statx-flavoured variant of ll_lookup(): look up 'name' under
 * 'parent', filling *stx with the attribute mask derived from
 * want/flags.  On success an ll reference is taken on the result and
 * returned via *out; on failure stx is zeroed and *out is NULL.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  // translate statx want/flags into a cap mask for the lookup
  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10991
10992 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10993 unsigned int want, unsigned int flags, const UserPerm& perms)
10994 {
10995 std::lock_guard lock(client_lock);
10996
10997 if (unmounting)
10998 return -ENOTCONN;
10999
11000 filepath fp(name, 0);
11001 InodeRef in;
11002 int rc;
11003 unsigned mask = statx_to_mask(flags, want);
11004
11005 ldout(cct, 3) << __func__ << " " << name << dendl;
11006 tout(cct) << __func__ << std::endl;
11007 tout(cct) << name << std::endl;
11008
11009 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11010 if (rc < 0) {
11011 /* zero out mask, just in case... */
11012 stx->stx_mask = 0;
11013 stx->stx_ino = 0;
11014 *out = NULL;
11015 return rc;
11016 } else {
11017 ceph_assert(in);
11018 fill_statx(in, mask, stx);
11019 _ll_get(in.get());
11020 *out = in.get();
11021 return 0;
11022 }
11023 }
11024
/*
 * Take one low-level (ll) reference on 'in'.
 *
 * On the 0 -> 1 transition we additionally pin the inode itself, pin
 * its parent dentry (directories only), and bump the per-snapshot
 * reference count in ll_snap_ref.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
11039
/*
 * Drop 'num' low-level references from 'in'.
 *
 * When the count reaches zero this undoes everything _ll_get() did on
 * the 0 -> 1 transition: unpins the parent dentry (directories),
 * decrements/erases the ll_snap_ref entry, and releases the inode pin.
 *
 * Returns 0 when the last ll reference was dropped, otherwise the
 * remaining reference count.
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
11062
/*
 * Drop every outstanding low-level reference (used during teardown).
 *
 * 'next' is captured before calling _ll_put() because dropping the
 * last ll ref can erase the current entry from inode_map.  to_be_put
 * additionally holds an InodeRef on each affected inode so that inode
 * destruction is deferred until this set is deconstructed on exit.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
11080
/*
 * Drop 'count' low-level references from 'in' (FUSE "forget").
 *
 * Returns true when the last ll reference was dropped (or when the
 * forget was ignored entirely: during unmount, or for the root inode).
 * A count larger than the current ll_ref is clamped with a warning.
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // caller forgot more refs than we handed out; drop what we have
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
11109
11110 bool Client::ll_forget(Inode *in, uint64_t count)
11111 {
11112 std::lock_guard lock(client_lock);
11113 return _ll_forget(in, count);
11114 }
11115
11116 bool Client::ll_put(Inode *in)
11117 {
11118 /* ll_forget already takes the lock */
11119 return ll_forget(in, 1);
11120 }
11121
11122 int Client::ll_get_snap_ref(snapid_t snap)
11123 {
11124 std::lock_guard lock(client_lock);
11125 auto p = ll_snap_ref.find(snap);
11126 if (p != ll_snap_ref.end())
11127 return p->second;
11128 return 0;
11129 }
11130
11131 snapid_t Client::ll_get_snapid(Inode *in)
11132 {
11133 std::lock_guard lock(client_lock);
11134 return in->snapid;
11135 }
11136
11137 Inode *Client::ll_get_inode(ino_t ino)
11138 {
11139 std::lock_guard lock(client_lock);
11140
11141 if (unmounting)
11142 return NULL;
11143
11144 vinodeno_t vino = _map_faked_ino(ino);
11145 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11146 if (p == inode_map.end())
11147 return NULL;
11148 Inode *in = p->second;
11149 _ll_get(in);
11150 return in;
11151 }
11152
11153 Inode *Client::ll_get_inode(vinodeno_t vino)
11154 {
11155 std::lock_guard lock(client_lock);
11156
11157 if (unmounting)
11158 return NULL;
11159
11160 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11161 if (p == inode_map.end())
11162 return NULL;
11163 Inode *in = p->second;
11164 _ll_get(in);
11165 return in;
11166 }
11167
11168 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11169 {
11170 vinodeno_t vino = _get_vino(in);
11171
11172 ldout(cct, 8) << __func__ << " " << vino << dendl;
11173 tout(cct) << __func__ << std::endl;
11174 tout(cct) << vino.ino.val << std::endl;
11175
11176 if (vino.snapid < CEPH_NOSNAP)
11177 return 0;
11178 else
11179 return _getattr(in, caps, perms);
11180 }
11181
11182 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11183 {
11184 std::lock_guard lock(client_lock);
11185
11186 if (unmounting)
11187 return -ENOTCONN;
11188
11189 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11190
11191 if (res == 0)
11192 fill_stat(in, attr);
11193 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11194 return res;
11195 }
11196
11197 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11198 unsigned int flags, const UserPerm& perms)
11199 {
11200 std::lock_guard lock(client_lock);
11201
11202 if (unmounting)
11203 return -ENOTCONN;
11204
11205 int res = 0;
11206 unsigned mask = statx_to_mask(flags, want);
11207
11208 if (mask && !in->caps_issued_mask(mask, true))
11209 res = _ll_getattr(in, mask, perms);
11210
11211 if (res == 0)
11212 fill_statx(in, mask, stx);
11213 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11214 return res;
11215 }
11216
/*
 * Common setattr worker for the ll_setattr*/ /* entry points.
 *
 * Performs the (optional) permission check, strips the *_NOW
 * convenience bits, and delegates to __setattrx().  The updated inode
 * is returned via *inp.  The tout lines below emit a fixed-order
 * replay trace and must stay in this order.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // the *_NOW bits were only inputs to may_setattr(); drop them here
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11245
11246 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11247 const UserPerm& perms)
11248 {
11249 std::lock_guard lock(client_lock);
11250
11251 if (unmounting)
11252 return -ENOTCONN;
11253
11254 InodeRef target(in);
11255 int res = _ll_setattrx(in, stx, mask, perms, &target);
11256 if (res == 0) {
11257 ceph_assert(in == target.get());
11258 fill_statx(in, in->caps_issued(), stx);
11259 }
11260
11261 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11262 return res;
11263 }
11264
11265 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11266 const UserPerm& perms)
11267 {
11268 struct ceph_statx stx;
11269 stat_to_statx(attr, &stx);
11270
11271 std::lock_guard lock(client_lock);
11272
11273 if (unmounting)
11274 return -ENOTCONN;
11275
11276 InodeRef target(in);
11277 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11278 if (res == 0) {
11279 ceph_assert(in == target.get());
11280 fill_stat(in, attr);
11281 }
11282
11283 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11284 return res;
11285 }
11286
11287
11288 // ----------
11289 // xattrs
11290
11291 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11292 const UserPerm& perms)
11293 {
11294 std::lock_guard lock(client_lock);
11295
11296 if (unmounting)
11297 return -ENOTCONN;
11298
11299 InodeRef in;
11300 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11301 if (r < 0)
11302 return r;
11303 return _getxattr(in, name, value, size, perms);
11304 }
11305
11306 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11307 const UserPerm& perms)
11308 {
11309 std::lock_guard lock(client_lock);
11310
11311 if (unmounting)
11312 return -ENOTCONN;
11313
11314 InodeRef in;
11315 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11316 if (r < 0)
11317 return r;
11318 return _getxattr(in, name, value, size, perms);
11319 }
11320
11321 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11322 const UserPerm& perms)
11323 {
11324 std::lock_guard lock(client_lock);
11325
11326 if (unmounting)
11327 return -ENOTCONN;
11328
11329 Fh *f = get_filehandle(fd);
11330 if (!f)
11331 return -EBADF;
11332 return _getxattr(f->inode, name, value, size, perms);
11333 }
11334
11335 int Client::listxattr(const char *path, char *list, size_t size,
11336 const UserPerm& perms)
11337 {
11338 std::lock_guard lock(client_lock);
11339
11340 if (unmounting)
11341 return -ENOTCONN;
11342
11343 InodeRef in;
11344 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11345 if (r < 0)
11346 return r;
11347 return Client::_listxattr(in.get(), list, size, perms);
11348 }
11349
11350 int Client::llistxattr(const char *path, char *list, size_t size,
11351 const UserPerm& perms)
11352 {
11353 std::lock_guard lock(client_lock);
11354
11355 if (unmounting)
11356 return -ENOTCONN;
11357
11358 InodeRef in;
11359 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11360 if (r < 0)
11361 return r;
11362 return Client::_listxattr(in.get(), list, size, perms);
11363 }
11364
11365 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11366 {
11367 std::lock_guard lock(client_lock);
11368
11369 if (unmounting)
11370 return -ENOTCONN;
11371
11372 Fh *f = get_filehandle(fd);
11373 if (!f)
11374 return -EBADF;
11375 return Client::_listxattr(f->inode.get(), list, size, perms);
11376 }
11377
11378 int Client::removexattr(const char *path, const char *name,
11379 const UserPerm& perms)
11380 {
11381 std::lock_guard lock(client_lock);
11382
11383 if (unmounting)
11384 return -ENOTCONN;
11385
11386 InodeRef in;
11387 int r = Client::path_walk(path, &in, perms, true);
11388 if (r < 0)
11389 return r;
11390 return _removexattr(in, name, perms);
11391 }
11392
11393 int Client::lremovexattr(const char *path, const char *name,
11394 const UserPerm& perms)
11395 {
11396 std::lock_guard lock(client_lock);
11397
11398 if (unmounting)
11399 return -ENOTCONN;
11400
11401 InodeRef in;
11402 int r = Client::path_walk(path, &in, perms, false);
11403 if (r < 0)
11404 return r;
11405 return _removexattr(in, name, perms);
11406 }
11407
11408 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11409 {
11410 std::lock_guard lock(client_lock);
11411
11412 if (unmounting)
11413 return -ENOTCONN;
11414
11415 Fh *f = get_filehandle(fd);
11416 if (!f)
11417 return -EBADF;
11418 return _removexattr(f->inode, name, perms);
11419 }
11420
11421 int Client::setxattr(const char *path, const char *name, const void *value,
11422 size_t size, int flags, const UserPerm& perms)
11423 {
11424 _setxattr_maybe_wait_for_osdmap(name, value, size);
11425
11426 std::lock_guard lock(client_lock);
11427
11428 if (unmounting)
11429 return -ENOTCONN;
11430
11431 InodeRef in;
11432 int r = Client::path_walk(path, &in, perms, true);
11433 if (r < 0)
11434 return r;
11435 return _setxattr(in, name, value, size, flags, perms);
11436 }
11437
11438 int Client::lsetxattr(const char *path, const char *name, const void *value,
11439 size_t size, int flags, const UserPerm& perms)
11440 {
11441 _setxattr_maybe_wait_for_osdmap(name, value, size);
11442
11443 std::lock_guard lock(client_lock);
11444
11445 if (unmounting)
11446 return -ENOTCONN;
11447
11448 InodeRef in;
11449 int r = Client::path_walk(path, &in, perms, false);
11450 if (r < 0)
11451 return r;
11452 return _setxattr(in, name, value, size, flags, perms);
11453 }
11454
11455 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11456 int flags, const UserPerm& perms)
11457 {
11458 _setxattr_maybe_wait_for_osdmap(name, value, size);
11459
11460 std::lock_guard lock(client_lock);
11461
11462 if (unmounting)
11463 return -ENOTCONN;
11464
11465 Fh *f = get_filehandle(fd);
11466 if (!f)
11467 return -EBADF;
11468 return _setxattr(f->inode, name, value, size, flags, perms);
11469 }
11470
/*
 * Core getxattr: handles virtual "ceph.*" xattrs via their callbacks,
 * then falls back to the inode's real xattr map (refreshed from the
 * MDS when needed).
 *
 * Returns the attribute length on success (value copied only when
 * size != 0), -ERANGE if the caller's buffer is too small, -ENODATA
 * when the attribute doesn't exist, or a getattr error.
 */
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    // NOTE(review): this assignment is dead — r is unconditionally
    // overwritten by the _getattr() result below.
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    // size == 0 is a length-only probe; otherwise copy out or -ERANGE
    if (size != 0) {
      if (r > (int)size) {
	r = -ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // without ACL support, "system.*" attributes are unsupported
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  // force a fetch only if we've never seen any xattrs for this inode
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
11536
11537 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11538 const UserPerm& perms)
11539 {
11540 if (cct->_conf->client_permissions) {
11541 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11542 if (r < 0)
11543 return r;
11544 }
11545 return _getxattr(in.get(), name, value, size, perms);
11546 }
11547
11548 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11549 size_t size, const UserPerm& perms)
11550 {
11551 std::lock_guard lock(client_lock);
11552
11553 if (unmounting)
11554 return -ENOTCONN;
11555
11556 vinodeno_t vino = _get_vino(in);
11557
11558 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11559 tout(cct) << __func__ << std::endl;
11560 tout(cct) << vino.ino.val << std::endl;
11561 tout(cct) << name << std::endl;
11562
11563 if (!fuse_default_permissions) {
11564 int r = xattr_permission(in, name, MAY_READ, perms);
11565 if (r < 0)
11566 return r;
11567 }
11568
11569 return _getxattr(in, name, value, size, perms);
11570 }
11571
/*
 * Fill 'name' with the inode's xattr names as consecutive
 * NUL-terminated strings.
 *
 * With size == 0 this is a length-only probe: nothing is copied and
 * the total buffer length needed is returned.  Otherwise returns the
 * number of bytes written, -ERANGE if the buffer is too small, or a
 * getattr error.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // force a fetch only if we've never seen any xattrs for this inode
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // +1 for the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11601
11602 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11603 const UserPerm& perms)
11604 {
11605 std::lock_guard lock(client_lock);
11606
11607 if (unmounting)
11608 return -ENOTCONN;
11609
11610 vinodeno_t vino = _get_vino(in);
11611
11612 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11613 tout(cct) << __func__ << std::endl;
11614 tout(cct) << vino.ino.val << std::endl;
11615 tout(cct) << size << std::endl;
11616
11617 return _listxattr(in, names, size, perms);
11618 }
11619
11620 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11621 size_t size, int flags, const UserPerm& perms)
11622 {
11623
11624 int xattr_flags = 0;
11625 if (!value)
11626 xattr_flags |= CEPH_XATTR_REMOVE;
11627 if (flags & XATTR_CREATE)
11628 xattr_flags |= CEPH_XATTR_CREATE;
11629 if (flags & XATTR_REPLACE)
11630 xattr_flags |= CEPH_XATTR_REPLACE;
11631
11632 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11633 filepath path;
11634 in->make_nosnap_relative_path(path);
11635 req->set_filepath(path);
11636 req->set_string2(name);
11637 req->set_inode(in);
11638 req->head.args.setxattr.flags = xattr_flags;
11639
11640 bufferlist bl;
11641 assert (value || size == 0);
11642 bl.append((const char*)value, size);
11643 req->set_data(bl);
11644
11645 int res = make_request(req, perms);
11646
11647 trim_cache();
11648 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
11649 res << dendl;
11650 return res;
11651 }
11652
/*
 * Core setxattr: validates the attribute namespace, handles POSIX ACL
 * attributes specially, guards read-only virtual xattrs, and finally
 * issues the MDS request via _do_setxattr().
 *
 * Returns 0 on success or a negative errno (-EROFS on snapshots,
 * -EOPNOTSUPP for unsupported namespaces/read-only vxattrs, -EINVAL
 * for bad input, or the _do_setxattr()/setattr error).
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // snapshots are immutable
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // normalize: empty value is allowed, but a NULL pointer with a
  // nonzero size is not
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
    return -EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // only well-known namespaces (plus ACLs when enabled) are accepted
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is equivalent to the mode bits alone: store no xattr
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // apply the mode implied by the ACL before storing it
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// default ACLs only make sense on directories
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // setting a quota requires a snaprealm rooted at this inode;
      // verify that after the request completes (see below)
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11733
11734 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11735 size_t size, int flags, const UserPerm& perms)
11736 {
11737 if (cct->_conf->client_permissions) {
11738 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11739 if (r < 0)
11740 return r;
11741 }
11742 return _setxattr(in.get(), name, value, size, flags, perms);
11743 }
11744
11745 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11746 {
11747 string tmp;
11748 if (name == "layout") {
11749 string::iterator begin = value.begin();
11750 string::iterator end = value.end();
11751 keys_and_values<string::iterator> p; // create instance of parser
11752 std::map<string, string> m; // map to receive results
11753 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11754 return -EINVAL;
11755 }
11756 if (begin != end)
11757 return -EINVAL;
11758 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11759 if (q->first == "pool") {
11760 tmp = q->second;
11761 break;
11762 }
11763 }
11764 } else if (name == "layout.pool") {
11765 tmp = value;
11766 }
11767
11768 if (tmp.length()) {
11769 int64_t pool;
11770 try {
11771 pool = boost::lexical_cast<unsigned>(tmp);
11772 if (!osdmap->have_pg_pool(pool))
11773 return -ENOENT;
11774 } catch (boost::bad_lexical_cast const&) {
11775 pool = osdmap->lookup_pg_pool_name(tmp);
11776 if (pool < 0) {
11777 return -ENOENT;
11778 }
11779 }
11780 }
11781
11782 return 0;
11783 }
11784
11785 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11786 {
11787 // For setting pool of layout, MetaRequest need osdmap epoch.
11788 // There is a race which create a new data pool but client and mds both don't have.
11789 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11790 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11791 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11792 string rest(strstr(name, "layout"));
11793 string v((const char*)value, size);
11794 int r = objecter->with_osdmap([&](const OSDMap& o) {
11795 return _setxattr_check_data_pool(rest, v, &o);
11796 });
11797
11798 if (r == -ENOENT) {
11799 C_SaferCond ctx;
11800 objecter->wait_for_latest_osdmap(&ctx);
11801 ctx.wait();
11802 }
11803 }
11804 }
11805
11806 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11807 size_t size, int flags, const UserPerm& perms)
11808 {
11809 _setxattr_maybe_wait_for_osdmap(name, value, size);
11810
11811 std::lock_guard lock(client_lock);
11812
11813 if (unmounting)
11814 return -ENOTCONN;
11815
11816 vinodeno_t vino = _get_vino(in);
11817
11818 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11819 tout(cct) << __func__ << std::endl;
11820 tout(cct) << vino.ino.val << std::endl;
11821 tout(cct) << name << std::endl;
11822
11823 if (!fuse_default_permissions) {
11824 int r = xattr_permission(in, name, MAY_WRITE, perms);
11825 if (r < 0)
11826 return r;
11827 }
11828 return _setxattr(in, name, value, size, flags, perms);
11829 }
11830
11831 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11832 {
11833 if (in->snapid != CEPH_NOSNAP) {
11834 return -EROFS;
11835 }
11836
11837 // same xattrs supported by kernel client
11838 if (strncmp(name, "user.", 5) &&
11839 strncmp(name, "system.", 7) &&
11840 strncmp(name, "security.", 9) &&
11841 strncmp(name, "trusted.", 8) &&
11842 strncmp(name, "ceph.", 5))
11843 return -EOPNOTSUPP;
11844
11845 const VXattr *vxattr = _match_vxattr(in, name);
11846 if (vxattr && vxattr->readonly)
11847 return -EOPNOTSUPP;
11848
11849 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11850 filepath path;
11851 in->make_nosnap_relative_path(path);
11852 req->set_filepath(path);
11853 req->set_filepath2(name);
11854 req->set_inode(in);
11855
11856 int res = make_request(req, perms);
11857
11858 trim_cache();
11859 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11860 return res;
11861 }
11862
11863 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11864 {
11865 if (cct->_conf->client_permissions) {
11866 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11867 if (r < 0)
11868 return r;
11869 }
11870 return _removexattr(in.get(), name, perms);
11871 }
11872
11873 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11874 {
11875 std::lock_guard lock(client_lock);
11876
11877 if (unmounting)
11878 return -ENOTCONN;
11879
11880 vinodeno_t vino = _get_vino(in);
11881
11882 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11883 tout(cct) << "ll_removexattr" << std::endl;
11884 tout(cct) << vino.ino.val << std::endl;
11885 tout(cct) << name << std::endl;
11886
11887 if (!fuse_default_permissions) {
11888 int r = xattr_permission(in, name, MAY_WRITE, perms);
11889 if (r < 0)
11890 return r;
11891 }
11892
11893 return _removexattr(in, name, perms);
11894 }
11895
11896 bool Client::_vxattrcb_quota_exists(Inode *in)
11897 {
11898 return in->quota.is_enable() &&
11899 (in->snapid != CEPH_NOSNAP ||
11900 (in->snaprealm && in->snaprealm->ino == in->ino));
11901 }
// Render both quota limits as "max_bytes=N max_files=N".
// Like snprintf, returns the length the output would have had.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// Individual quota fields.
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11917
11918 bool Client::_vxattrcb_layout_exists(Inode *in)
11919 {
11920 return in->layout != file_layout_t();
11921 }
11922 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11923 {
11924 int r = snprintf(val, size,
11925 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11926 (unsigned long long)in->layout.stripe_unit,
11927 (unsigned long long)in->layout.stripe_count,
11928 (unsigned long long)in->layout.object_size);
11929 objecter->with_osdmap([&](const OSDMap& o) {
11930 if (o.have_pg_pool(in->layout.pool_id))
11931 r += snprintf(val + r, size - r, "%s",
11932 o.get_pool_name(in->layout.pool_id).c_str());
11933 else
11934 r += snprintf(val + r, size - r, "%" PRIu64,
11935 (uint64_t)in->layout.pool_id);
11936 });
11937 if (in->layout.pool_ns.length())
11938 r += snprintf(val + r, size - r, " pool_namespace=%s",
11939 in->layout.pool_ns.c_str());
11940 return r;
11941 }
// Per-field layout vxattr callbacks.  Each writes at most 'size' bytes into
// 'val' and returns the snprintf would-be length.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// Pool is rendered as its name when present in the osdmap, else as the
// numeric pool id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;  // always assigned inside the lambda below
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
		   in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics vxattrs.  dirstat counts the directory's own
// children; the r* values come from rstat and appear to be recursive over
// the subtree (matching the MDS recursive-stat accounting).
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  // recursive ctime, rendered as <seconds>.<9-digit nanoseconds>
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
                  (long)in->rstat.rctime.nsec());
}
// The export pin vxattr only "exists" when dir_pin differs from -ENODATA;
// -ENODATA presumably acts as the "no pin set" sentinel — confirm on the
// MDS side.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
12011
// A zero snap_btime is treated as "no birth time recorded".
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

// Snapshot birth time, rendered as <seconds>.<9-digit nanoseconds>.
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
                  (long long unsigned)in->snap_btime.sec(),
                  (long unsigned)in->snap_btime.nsec());
}

// Cluster fsid as reported by the monitor client.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// This client's entity name, type string followed by the numeric id
// (presumably rendering as e.g. "client.1234" — verify against
// entity_name_t::type_str()).
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%ld", name.type_str(), name.num());
}
12034
// Helpers for building vxattr table entries.  CEPH_XATTR_NAME composes
// "ceph.<type>.<name>"; CEPH_XATTR_NAME2 appends a third component.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only stat-style vxattr entry with no existence callback (always
// present).  Uses GNU designated-initializer (label:) syntax, matching the
// VXattr field order.
#define XATTR_NAME_CEPH(_type, _name, _flags)                 \
{                                                             \
  name: CEPH_XATTR_NAME(_type, _name),                        \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,    \
  readonly: true,                                             \
  exists_cb: NULL,                                            \
  flags: _flags,                                              \
}
// Writable layout-field vxattr; only reported when the layout is
// non-default (see _vxattrcb_layout_exists).
#define XATTR_LAYOUT_FIELD(_type, _name, _field)              \
{                                                             \
  name: CEPH_XATTR_NAME2(_type, _name, _field),               \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,   \
  readonly: false,                                            \
  exists_cb: &Client::_vxattrcb_layout_exists,                \
  flags: 0,                                                   \
}
// Writable quota-field vxattr; only reported when a quota is in effect
// (see _vxattrcb_quota_exists).
#define XATTR_QUOTA_FIELD(_type, _name)                       \
{                                                             \
  name: CEPH_XATTR_NAME(_type, _name),                        \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,    \
  readonly: false,                                            \
  exists_cb: &Client::_vxattrcb_quota_exists,                 \
  flags: 0,                                                   \
}
12062
// Virtual xattrs available on directory inodes.  Scanned linearly by
// _match_vxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  // whole layout as one string
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // individual layout fields
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // directory statistics (read-only)
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  // quota: combined and per-field
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // MDS export pin
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  // snapshot birth time (read-only)
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12109
// Virtual xattrs available on regular-file inodes; same layout entries as
// the directory table, plus the snapshot birth time.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12132
// Virtual xattrs available on every inode type; consulted by _match_vxattr
// after the per-type table.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
12150
12151 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12152 {
12153 if (in->is_dir())
12154 return _dir_vxattrs;
12155 else if (in->is_file())
12156 return _file_vxattrs;
12157 return NULL;
12158 }
12159
12160 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12161 {
12162 if (strncmp(name, "ceph.", 5) == 0) {
12163 const VXattr *vxattr = _get_vxattrs(in);
12164 if (vxattr) {
12165 while (!vxattr->name.empty()) {
12166 if (vxattr->name == name)
12167 return vxattr;
12168 vxattr++;
12169 }
12170 }
12171
12172 // for common vxattrs
12173 vxattr = _common_vxattrs;
12174 while (!vxattr->name.empty()) {
12175 if (vxattr->name == name)
12176 return vxattr;
12177 vxattr++;
12178 }
12179 }
12180
12181 return NULL;
12182 }
12183
12184 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12185 {
12186 std::lock_guard lock(client_lock);
12187
12188 if (unmounting)
12189 return -ENOTCONN;
12190
12191 vinodeno_t vino = _get_vino(in);
12192
12193 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12194 tout(cct) << "ll_readlink" << std::endl;
12195 tout(cct) << vino.ino.val << std::endl;
12196
12197 for (auto dn : in->dentries) {
12198 touch_dn(dn);
12199 }
12200
12201 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12202 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12203 return r;
12204 }
12205
// Create a filesystem node named 'name' in directory 'dir' via
// CEPH_MDS_OP_MKNOD.  On success *inp refers to the new inode.
// Returns 0 or a negative errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // cannot create entries in a snapshot
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota on the enclosing quota root
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // inherit default ACLs from the parent; may also adjust 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12259
12260 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12261 dev_t rdev, struct stat *attr, Inode **out,
12262 const UserPerm& perms)
12263 {
12264 std::lock_guard lock(client_lock);
12265
12266 if (unmounting)
12267 return -ENOTCONN;
12268
12269 vinodeno_t vparent = _get_vino(parent);
12270
12271 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12272 tout(cct) << "ll_mknod" << std::endl;
12273 tout(cct) << vparent.ino.val << std::endl;
12274 tout(cct) << name << std::endl;
12275 tout(cct) << mode << std::endl;
12276 tout(cct) << rdev << std::endl;
12277
12278 if (!fuse_default_permissions) {
12279 int r = may_create(parent, perms);
12280 if (r < 0)
12281 return r;
12282 }
12283
12284 InodeRef in;
12285 int r = _mknod(parent, name, mode, rdev, perms, &in);
12286 if (r == 0) {
12287 fill_stat(in, attr);
12288 _ll_get(in.get());
12289 }
12290 tout(cct) << attr->st_ino << std::endl;
12291 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12292 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12293 *out = in.get();
12294 return r;
12295 }
12296
12297 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12298 dev_t rdev, Inode **out,
12299 struct ceph_statx *stx, unsigned want, unsigned flags,
12300 const UserPerm& perms)
12301 {
12302 unsigned caps = statx_to_mask(flags, want);
12303 std::lock_guard lock(client_lock);
12304
12305 if (unmounting)
12306 return -ENOTCONN;
12307
12308 vinodeno_t vparent = _get_vino(parent);
12309
12310 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12311 tout(cct) << "ll_mknodx" << std::endl;
12312 tout(cct) << vparent.ino.val << std::endl;
12313 tout(cct) << name << std::endl;
12314 tout(cct) << mode << std::endl;
12315 tout(cct) << rdev << std::endl;
12316
12317 if (!fuse_default_permissions) {
12318 int r = may_create(parent, perms);
12319 if (r < 0)
12320 return r;
12321 }
12322
12323 InodeRef in;
12324 int r = _mknod(parent, name, mode, rdev, perms, &in);
12325 if (r == 0) {
12326 fill_statx(in, caps, stx);
12327 _ll_get(in.get());
12328 }
12329 tout(cct) << stx->stx_ino << std::endl;
12330 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12331 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12332 *out = in.get();
12333 return r;
12334 }
12335
// Create a regular file 'name' in 'dir' and, if fhp is non-NULL, open it.
//
// @param inp         set to the created (or pre-existing) inode
// @param fhp         if non-NULL, receives a newly created open file handle
// @param stripe_unit/stripe_count/object_size  file layout parameters
// @param data_pool   optional data pool name; must exist in the osdmap
// @param created     passed through to make_request; reports whether the
//                    file was newly created
// @return 0 or a negative errno
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // no creating inside snapshots
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // resolve the requested data pool name (if any) to a pool id
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    // ids above 32 bits don't fit the open request — presumably a wire
    // format limit
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  // inherit default ACLs from the parent; may modify 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12430
12431
// Create directory 'name' in 'dir'.  When 'dir' is the special snapdir this
// issues CEPH_MDS_OP_MKSNAP (creating a snapshot) instead of MKDIR.
// On success *inp refers to the new inode.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // creation is allowed on the live tree or in the snapdir (as a snapshot)
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  // inherit default ACLs from the parent; may modify 'mode'
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12487
12488 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12489 struct stat *attr, Inode **out, const UserPerm& perm)
12490 {
12491 std::lock_guard lock(client_lock);
12492
12493 if (unmounting)
12494 return -ENOTCONN;
12495
12496 vinodeno_t vparent = _get_vino(parent);
12497
12498 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12499 tout(cct) << "ll_mkdir" << std::endl;
12500 tout(cct) << vparent.ino.val << std::endl;
12501 tout(cct) << name << std::endl;
12502 tout(cct) << mode << std::endl;
12503
12504 if (!fuse_default_permissions) {
12505 int r = may_create(parent, perm);
12506 if (r < 0)
12507 return r;
12508 }
12509
12510 InodeRef in;
12511 int r = _mkdir(parent, name, mode, perm, &in);
12512 if (r == 0) {
12513 fill_stat(in, attr);
12514 _ll_get(in.get());
12515 }
12516 tout(cct) << attr->st_ino << std::endl;
12517 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12518 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12519 *out = in.get();
12520 return r;
12521 }
12522
12523 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12524 struct ceph_statx *stx, unsigned want, unsigned flags,
12525 const UserPerm& perms)
12526 {
12527 std::lock_guard lock(client_lock);
12528
12529 if (unmounting)
12530 return -ENOTCONN;
12531
12532 vinodeno_t vparent = _get_vino(parent);
12533
12534 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12535 tout(cct) << "ll_mkdirx" << std::endl;
12536 tout(cct) << vparent.ino.val << std::endl;
12537 tout(cct) << name << std::endl;
12538 tout(cct) << mode << std::endl;
12539
12540 if (!fuse_default_permissions) {
12541 int r = may_create(parent, perms);
12542 if (r < 0)
12543 return r;
12544 }
12545
12546 InodeRef in;
12547 int r = _mkdir(parent, name, mode, perms, &in);
12548 if (r == 0) {
12549 fill_statx(in, statx_to_mask(flags, want), stx);
12550 _ll_get(in.get());
12551 } else {
12552 stx->stx_ino = 0;
12553 stx->stx_mask = 0;
12554 }
12555 tout(cct) << stx->stx_ino << std::endl;
12556 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12557 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12558 *out = in.get();
12559 return r;
12560 }
12561
// Create a symlink 'name' -> 'target' in directory 'dir' via
// CEPH_MDS_OP_SYMLINK.  On success *inp refers to the new inode.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // enforce the max_files quota
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // the link target travels in the request's second string
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12607
12608 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12609 struct stat *attr, Inode **out, const UserPerm& perms)
12610 {
12611 std::lock_guard lock(client_lock);
12612
12613 if (unmounting)
12614 return -ENOTCONN;
12615
12616 vinodeno_t vparent = _get_vino(parent);
12617
12618 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12619 << dendl;
12620 tout(cct) << "ll_symlink" << std::endl;
12621 tout(cct) << vparent.ino.val << std::endl;
12622 tout(cct) << name << std::endl;
12623 tout(cct) << value << std::endl;
12624
12625 if (!fuse_default_permissions) {
12626 int r = may_create(parent, perms);
12627 if (r < 0)
12628 return r;
12629 }
12630
12631 InodeRef in;
12632 int r = _symlink(parent, name, value, perms, &in);
12633 if (r == 0) {
12634 fill_stat(in, attr);
12635 _ll_get(in.get());
12636 }
12637 tout(cct) << attr->st_ino << std::endl;
12638 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12639 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12640 *out = in.get();
12641 return r;
12642 }
12643
12644 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12645 Inode **out, struct ceph_statx *stx, unsigned want,
12646 unsigned flags, const UserPerm& perms)
12647 {
12648 std::lock_guard lock(client_lock);
12649
12650 if (unmounting)
12651 return -ENOTCONN;
12652
12653 vinodeno_t vparent = _get_vino(parent);
12654
12655 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12656 << dendl;
12657 tout(cct) << "ll_symlinkx" << std::endl;
12658 tout(cct) << vparent.ino.val << std::endl;
12659 tout(cct) << name << std::endl;
12660 tout(cct) << value << std::endl;
12661
12662 if (!fuse_default_permissions) {
12663 int r = may_create(parent, perms);
12664 if (r < 0)
12665 return r;
12666 }
12667
12668 InodeRef in;
12669 int r = _symlink(parent, name, value, perms, &in);
12670 if (r == 0) {
12671 fill_statx(in, statx_to_mask(flags, want), stx);
12672 _ll_get(in.get());
12673 }
12674 tout(cct) << stx->stx_ino << std::endl;
12675 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12676 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12677 *out = in.get();
12678 return r;
12679 }
12680
// Unlink 'name' from directory 'dir' via CEPH_MDS_OP_UNLINK.  The target
// inode is looked up first so it can be attached to the request and its
// file delegations broken before the link count changes.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being unlinked so we can attach it to the request
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  // revoke any outstanding delegations on the target before unlinking
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12730
12731 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12732 {
12733 std::lock_guard lock(client_lock);
12734
12735 if (unmounting)
12736 return -ENOTCONN;
12737
12738 vinodeno_t vino = _get_vino(in);
12739
12740 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12741 tout(cct) << "ll_unlink" << std::endl;
12742 tout(cct) << vino.ino.val << std::endl;
12743 tout(cct) << name << std::endl;
12744
12745 if (!fuse_default_permissions) {
12746 int r = may_delete(in, name, perm);
12747 if (r < 0)
12748 return r;
12749 }
12750 return _unlink(in, name, perm);
12751 }
12752
// Remove directory (or snapshot) `name` under `dir`.  A CEPH_SNAPDIR
// parent turns this into a CEPH_MDS_OP_RMSNAP; otherwise it is a
// regular CEPH_MDS_OP_RMDIR.  Caller must hold client_lock.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
                << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // only live dirs and the .snap pseudo-dir are mutable
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    // RMSNAP: keep our own ref on the dentry instead of handing it to
    // the request, since we unlink it manually below
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  // NOTE(review): if _lookup fails on the RMSNAP path, the de->get()
  // above is never balanced by de->put() -- looks like a leaked dentry
  // ref; confirm.

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate it ourselves
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12805
12806 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12807 {
12808 std::lock_guard lock(client_lock);
12809
12810 if (unmounting)
12811 return -ENOTCONN;
12812
12813 vinodeno_t vino = _get_vino(in);
12814
12815 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12816 tout(cct) << "ll_rmdir" << std::endl;
12817 tout(cct) << vino.ino.val << std::endl;
12818 tout(cct) << name << std::endl;
12819
12820 if (!fuse_default_permissions) {
12821 int r = may_delete(in, name, perms);
12822 if (r < 0)
12823 return r;
12824 }
12825
12826 return _rmdir(in, name, perms);
12827 }
12828
// Rename fromdir/fromname to todir/toname via the MDS.  Renaming within
// a .snap directory becomes CEPH_MDS_OP_RENAMESNAP; cross-snapid renames
// are rejected with -EXDEV and any other snapshot rename with -EROFS.
// When the source and destination live under different quota roots the
// destination quota is pre-checked client-side (a directory move that
// would exceed quota returns -EXDEV so the caller can fall back to
// copy+delete; a file returns -EDQUOT).  Caller must hold client_lock.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // only renaming a snapshot within its own .snap dir is allowed
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    if (fromdir != todir) {
      // resolve the nearest enclosing quota root on each side
      fromdir_root =
        fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
        todir->quota.is_enable() ? todir : get_quota_root(todir, perm);

      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
        // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
        // to auth MDS to get latest rstat for todir_root and source dir
        // even if their dentry caches and inode caps are satisfied.
        res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
        if (res < 0)
          goto fail;

        quota_check = true;
        if (oldde->inode && oldde->inode->is_dir()) {
          // need fresh recursive stats for the moved directory too
          mask |= CEPH_STAT_RSTAT;
        }
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      // size/file-count the move would add to the destination quota root
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
        old_bytes = oldinode->rstat.rbytes;
        old_files = oldinode->rstat.rsize();
      } else {
        old_bytes = oldinode->size;
        old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
          (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
                       << old_bytes << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
          (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
        ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
                       << old_files << ") to (" << todir->ino
                       << ") will exceed quota on " << *todir_root << dendl;
        quota_exceed = true;
      }

      if (quota_exceed) {
        // -EXDEV for dirs so callers can retry as copy+unlink
        res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
        goto fail;
      }
    }

    // the destination may or may not exist; -ENOENT is not an error
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
12984
12985 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12986 const char *newname, const UserPerm& perm)
12987 {
12988 std::lock_guard lock(client_lock);
12989
12990 if (unmounting)
12991 return -ENOTCONN;
12992
12993 vinodeno_t vparent = _get_vino(parent);
12994 vinodeno_t vnewparent = _get_vino(newparent);
12995
12996 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12997 << vnewparent << " " << newname << dendl;
12998 tout(cct) << "ll_rename" << std::endl;
12999 tout(cct) << vparent.ino.val << std::endl;
13000 tout(cct) << name << std::endl;
13001 tout(cct) << vnewparent.ino.val << std::endl;
13002 tout(cct) << newname << std::endl;
13003
13004 if (!fuse_default_permissions) {
13005 int r = may_delete(parent, name, perm);
13006 if (r < 0)
13007 return r;
13008 r = may_delete(newparent, newname, perm);
13009 if (r < 0 && r != -ENOENT)
13010 return r;
13011 }
13012
13013 return _rename(parent, name, newparent, newname, perm);
13014 }
13015
// Create a hard link `dir`/`newname` to existing inode `in` via
// CEPH_MDS_OP_LINK.  Caller must hold client_lock.  Returns 0 or a
// negative errno (-ENAMETOOLONG, -EROFS for snapshots, -EDQUOT when the
// file-count quota is exhausted, or a request error).
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // neither the target inode nor the directory may be a snapshot
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // recall delegations before the link count changes
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13060
13061 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13062 const UserPerm& perm)
13063 {
13064 std::lock_guard lock(client_lock);
13065
13066 if (unmounting)
13067 return -ENOTCONN;
13068
13069 vinodeno_t vino = _get_vino(in);
13070 vinodeno_t vnewparent = _get_vino(newparent);
13071
13072 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13073 newname << dendl;
13074 tout(cct) << "ll_link" << std::endl;
13075 tout(cct) << vino.ino.val << std::endl;
13076 tout(cct) << vnewparent << std::endl;
13077 tout(cct) << newname << std::endl;
13078
13079 InodeRef target;
13080
13081 if (!fuse_default_permissions) {
13082 if (S_ISDIR(in->mode))
13083 return -EPERM;
13084
13085 int r = may_hardlink(in, perm);
13086 if (r < 0)
13087 return r;
13088
13089 r = may_create(newparent, perm);
13090 if (r < 0)
13091 return r;
13092 }
13093
13094 return _link(in, newparent, newname, perm, &target);
13095 }
13096
13097 int Client::ll_num_osds(void)
13098 {
13099 std::lock_guard lock(client_lock);
13100 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13101 }
13102
// Look up OSD `osd`'s first address and store it, host byte order, in
// *addr.  Returns 0 on success, -1 if the OSD does not exist in the map.
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::lock_guard lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  // NOTE(review): assumes the OSD's first address is IPv4 -- in4_addr()
  // on an IPv6 address would yield garbage; confirm this API is only
  // used on v4 clusters.
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
13120
13121 uint32_t Client::ll_stripe_unit(Inode *in)
13122 {
13123 std::lock_guard lock(client_lock);
13124 return in->layout.stripe_unit;
13125 }
13126
// Snapshot sequence number of the snap realm `in` belongs to.
// NOTE(review): dereferences in->snaprealm unconditionally -- assumes
// every inode handed to this API is attached to a realm; confirm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
13132
13133 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13134 {
13135 std::lock_guard lock(client_lock);
13136 *layout = in->layout;
13137 return 0;
13138 }
13139
// Fh-based overload: forwards to the Inode variant using the handle's
// inode.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
13144
13145 /* Currently we cannot take advantage of redundancy in reads, since we
13146 would have to go through all possible placement groups (a
13147 potentially quite large number determined by a hash), and use CRUSH
13148 to calculate the appropriate set of OSDs for each placement group,
13149 then index into that. An array with one entry per OSD is much more
13150 tractable and works for demonstration purposes. */
13151
// Map logical block `blockno` of `in` (under *layout) to the primary
// OSD currently serving it.  Returns the primary OSD id from the acting
// set of the object's PG.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  // NOTE(review): assumes su != 0 and object_size >= su; a degenerate
  // layout would divide by zero here or below -- confirm layouts are
  // validated upstream.
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count; // which horizontal stripe (Y)
    stripepos = blockno % stripe_count; // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object; // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  object_t oid = file_object_t(ino, objectno);
  // resolve object -> PG -> acting set; the first entry's primary is the
  // answer
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
        o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
13182
/* Return the offset of the block, internal to the object */

uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  // NOTE(review): divides by su and stripes_per_object without a zero
  // check -- assumes a valid layout; confirm.
  uint64_t stripes_per_object = object_size / su;

  // position of the block within its object, in bytes
  return (blockno % stripes_per_object) * su;
}
13195
13196 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13197 const UserPerm& perms)
13198 {
13199 std::lock_guard lock(client_lock);
13200
13201 if (unmounting)
13202 return -ENOTCONN;
13203
13204 vinodeno_t vino = _get_vino(in);
13205
13206 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13207 tout(cct) << "ll_opendir" << std::endl;
13208 tout(cct) << vino.ino.val << std::endl;
13209
13210 if (!fuse_default_permissions) {
13211 int r = may_open(in, flags, perms);
13212 if (r < 0)
13213 return r;
13214 }
13215
13216 int r = _opendir(in, dirpp, perms);
13217 tout(cct) << (unsigned long)*dirpp << std::endl;
13218
13219 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13220 << dendl;
13221 return r;
13222 }
13223
13224 int Client::ll_releasedir(dir_result_t *dirp)
13225 {
13226 std::lock_guard lock(client_lock);
13227 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13228 tout(cct) << "ll_releasedir" << std::endl;
13229 tout(cct) << (unsigned long)dirp << std::endl;
13230
13231 if (unmounting)
13232 return -ENOTCONN;
13233
13234 _closedir(dirp);
13235 return 0;
13236 }
13237
13238 int Client::ll_fsyncdir(dir_result_t *dirp)
13239 {
13240 std::lock_guard lock(client_lock);
13241 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13242 tout(cct) << "ll_fsyncdir" << std::endl;
13243 tout(cct) << (unsigned long)dirp << std::endl;
13244
13245 if (unmounting)
13246 return -ENOTCONN;
13247
13248 return _fsync(dirp->inode.get(), false);
13249 }
13250
// Low-level open of an existing inode.  O_CREAT is explicitly not
// supported here (use ll_create/ll_createx).  On success, any handle
// produced is remembered in ll_unclosed_fh_set for leak tracking.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  // NOTE(review): on the permission-failure goto, *fhp is read here
  // without this function ever having written it -- assumes callers
  // pass fhp pre-initialized (e.g. to NULL); confirm.
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13286
// Shared implementation behind ll_create/ll_createx: look the name up,
// create it if absent and O_CREAT is set, then open it.  On success *in
// holds the (possibly new) inode and *fhp the open handle; every handle
// produced is recorded in ll_unclosed_fh_set.  Caller must hold
// client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
        goto out;
    }
    // _create may already return an open handle in *fhp
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
                perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission, then open it unless
    // we somehow already hold a handle
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
        if (*fhp) {
          int release_r = _release_fh(*fhp);
          ceph_assert(release_r == 0); // during create, no async data ops should have happened
        }
        goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
        goto out;
    }
  }

out:
  if (*fhp) {
    // track handles handed out through the ll_* interface
    ll_unclosed_fh_set.insert(*fhp);
  }

  // report the (possibly faked) inode number in the trace/log
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13368
13369 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13370 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13371 const UserPerm& perms)
13372 {
13373 std::lock_guard lock(client_lock);
13374 InodeRef in;
13375
13376 if (unmounting)
13377 return -ENOTCONN;
13378
13379 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13380 fhp, perms);
13381 if (r >= 0) {
13382 ceph_assert(in);
13383
13384 // passing an Inode in outp requires an additional ref
13385 if (outp) {
13386 _ll_get(in.get());
13387 *outp = in.get();
13388 }
13389 fill_stat(in, attr);
13390 } else {
13391 attr->st_ino = 0;
13392 }
13393
13394 return r;
13395 }
13396
13397 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13398 int oflags, Inode **outp, Fh **fhp,
13399 struct ceph_statx *stx, unsigned want, unsigned lflags,
13400 const UserPerm& perms)
13401 {
13402 unsigned caps = statx_to_mask(lflags, want);
13403 std::lock_guard lock(client_lock);
13404 InodeRef in;
13405
13406 if (unmounting)
13407 return -ENOTCONN;
13408
13409 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13410 if (r >= 0) {
13411 ceph_assert(in);
13412
13413 // passing an Inode in outp requires an additional ref
13414 if (outp) {
13415 _ll_get(in.get());
13416 *outp = in.get();
13417 }
13418 fill_statx(in, caps, stx);
13419 } else {
13420 stx->stx_ino = 0;
13421 stx->stx_mask = 0;
13422 }
13423
13424 return r;
13425 }
13426
13427 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13428 {
13429 std::lock_guard lock(client_lock);
13430 tout(cct) << "ll_lseek" << std::endl;
13431 tout(cct) << offset << std::endl;
13432 tout(cct) << whence << std::endl;
13433
13434 if (unmounting)
13435 return -ENOTCONN;
13436
13437 return _lseek(fh, offset, whence);
13438 }
13439
// Read up to `len` bytes at offset `off` from `fh` into *bl.  Returns
// the number of bytes read or a negative errno.
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  if (unmounting)
    return -ENOTCONN;

  /* We can't return a byte count larger than INT_MAX (the return type),
   * so clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _read(fh, off, len, bl);
  ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
                << dendl;
  return r;
}
13459
// Synchronously read [offset, offset+length) of object `blockid` of
// inode `in` directly from the OSDs into `buf`, bypassing the client
// cache and capability machinery.  Returns the number of bytes read or
// a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
                 object_locator_t(layout->pool_id),
                 offset,
                 length,
                 vino.snapid,
                 &bl,
                 CEPH_OSD_FLAG_READ,
                 &onfinish);

  // drop client_lock while blocking on the OSD reply so other client
  // threads can make progress
  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    // copy whatever came back into the caller's buffer and report its size
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13496
13497 /* It appears that the OSD doesn't return success unless the entire
13498 buffer was written, return the write length on success. */
13499
13500 int Client::ll_write_block(Inode *in, uint64_t blockid,
13501 char* buf, uint64_t offset,
13502 uint64_t length, file_layout_t* layout,
13503 uint64_t snapseq, uint32_t sync)
13504 {
13505 vinodeno_t vino = ll_get_vino(in);
13506 int r = 0;
13507 std::unique_ptr<C_SaferCond> onsafe = nullptr;
13508
13509 if (length == 0) {
13510 return -EINVAL;
13511 }
13512 if (true || sync) {
13513 /* if write is stable, the epilogue is waiting on
13514 * flock */
13515 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
13516 }
13517 object_t oid = file_object_t(vino.ino, blockid);
13518 SnapContext fakesnap;
13519 ceph::bufferlist bl;
13520 if (length > 0) {
13521 bl.push_back(buffer::copy(buf, length));
13522 }
13523
13524 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
13525 << dendl;
13526
13527 fakesnap.seq = snapseq;
13528
13529 /* lock just in time */
13530 client_lock.lock();
13531 if (unmounting) {
13532 client_lock.unlock();
13533 return -ENOTCONN;
13534 }
13535
13536 objecter->write(oid,
13537 object_locator_t(layout->pool_id),
13538 offset,
13539 length,
13540 fakesnap,
13541 bl,
13542 ceph::real_clock::now(),
13543 0,
13544 onsafe.get());
13545
13546 client_lock.unlock();
13547 if (nullptr != onsafe) {
13548 r = onsafe->wait();
13549 }
13550
13551 if (r < 0) {
13552 return r;
13553 } else {
13554 return length;
13555 }
13556 }
13557
// Commit previously written blocks in [offset, offset+length) of `in`.
// The barrier-based implementation is compiled out below; this is
// currently a no-op that always reports success.
int Client::ll_commit_blocks(Inode *in,
                             uint64_t offset,
                             uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
                << offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
13583
13584 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13585 {
13586 std::lock_guard lock(client_lock);
13587 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13588 "~" << len << dendl;
13589 tout(cct) << "ll_write" << std::endl;
13590 tout(cct) << (unsigned long)fh << std::endl;
13591 tout(cct) << off << std::endl;
13592 tout(cct) << len << std::endl;
13593
13594 if (unmounting)
13595 return -ENOTCONN;
13596
13597 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13598 len = std::min(len, (loff_t)INT_MAX);
13599 int r = _write(fh, off, len, data, NULL, 0);
13600 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13601 << dendl;
13602 return r;
13603 }
13604
13605 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13606 {
13607 std::lock_guard lock(client_lock);
13608 if (unmounting)
13609 return -ENOTCONN;
13610 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13611 }
13612
13613 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13614 {
13615 std::lock_guard lock(client_lock);
13616 if (unmounting)
13617 return -ENOTCONN;
13618 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13619 }
13620
13621 int Client::ll_flush(Fh *fh)
13622 {
13623 std::lock_guard lock(client_lock);
13624 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13625 tout(cct) << "ll_flush" << std::endl;
13626 tout(cct) << (unsigned long)fh << std::endl;
13627
13628 if (unmounting)
13629 return -ENOTCONN;
13630
13631 return _flush(fh);
13632 }
13633
13634 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13635 {
13636 std::lock_guard lock(client_lock);
13637 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13638 tout(cct) << "ll_fsync" << std::endl;
13639 tout(cct) << (unsigned long)fh << std::endl;
13640
13641 if (unmounting)
13642 return -ENOTCONN;
13643
13644 int r = _fsync(fh, syncdataonly);
13645 if (r) {
13646 // If we're returning an error, clear it from the FH
13647 fh->take_async_err();
13648 }
13649 return r;
13650 }
13651
13652 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13653 {
13654 std::lock_guard lock(client_lock);
13655 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13656 tout(cct) << "ll_sync_inode" << std::endl;
13657 tout(cct) << (unsigned long)in << std::endl;
13658
13659 if (unmounting)
13660 return -ENOTCONN;
13661
13662 return _fsync(in, syncdataonly);
13663 }
13664
13665 #ifdef FALLOC_FL_PUNCH_HOLE
13666
// fallocate(2)-style space manipulation on `fh`.  Only
// FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE (which must come with
// KEEP_SIZE) are supported.  Hole punching zeroes the range either in
// the inline data (if small and we hold buffer caps) or via an OSD zero
// op; a plain allocate only ever grows the advertised file size.
// Caller must hold client_lock; the lock is dropped while waiting on
// OSD operations.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // punching a hole without keeping the size is not supported
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // allocating on a full pool would add data; punching a hole is still ok
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // growing the file must respect the byte quota
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // data is still inline and we may buffer writes: rebuild the
      // inline blob with the punched range zeroed in place
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);          // prefix before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;                   // clamp hole to inline size
        if (size > 0)
          bl.append_zero(size);                  // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // suffix after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // must go through the OSDs: first push any inline data out...
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      // ...then zero the range with a filer op and wait for it
      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop the client lock while the zero op is in flight
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocate: just extend the advertised size if needed
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // wait for the uninline started above; again without the lock held
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // -ECANCELED means the data was already uninlined by someone else
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13787 #else
13788
// Fallback when the platform lacks FALLOC_FL_PUNCH_HOLE: fallocate is
// not supported at all.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}
13793
13794 #endif
13795
13796
13797 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
13798 {
13799 std::lock_guard lock(client_lock);
13800 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13801 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
13802 tout(cct) << (unsigned long)fh << std::endl;
13803
13804 if (unmounting)
13805 return -ENOTCONN;
13806
13807 return _fallocate(fh, mode, offset, length);
13808 }
13809
13810 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13811 {
13812 std::lock_guard lock(client_lock);
13813 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
13814
13815 if (unmounting)
13816 return -ENOTCONN;
13817
13818 Fh *fh = get_filehandle(fd);
13819 if (!fh)
13820 return -EBADF;
13821 #if defined(__linux__) && defined(O_PATH)
13822 if (fh->flags & O_PATH)
13823 return -EBADF;
13824 #endif
13825 return _fallocate(fh, mode, offset, length);
13826 }
13827
13828 int Client::ll_release(Fh *fh)
13829 {
13830 std::lock_guard lock(client_lock);
13831
13832 if (unmounting)
13833 return -ENOTCONN;
13834
13835 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
13836 dendl;
13837 tout(cct) << __func__ << " (fh)" << std::endl;
13838 tout(cct) << (unsigned long)fh << std::endl;
13839
13840 if (ll_unclosed_fh_set.count(fh))
13841 ll_unclosed_fh_set.erase(fh);
13842 return _release_fh(fh);
13843 }
13844
13845 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13846 {
13847 std::lock_guard lock(client_lock);
13848
13849 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13850 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13851
13852 if (unmounting)
13853 return -ENOTCONN;
13854
13855 return _getlk(fh, fl, owner);
13856 }
13857
13858 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13859 {
13860 std::lock_guard lock(client_lock);
13861
13862 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13863 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13864
13865 if (unmounting)
13866 return -ENOTCONN;
13867
13868 return _setlk(fh, fl, owner, sleep);
13869 }
13870
13871 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13872 {
13873 std::lock_guard lock(client_lock);
13874
13875 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13876 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
13877
13878 if (unmounting)
13879 return -ENOTCONN;
13880
13881 return _flock(fh, cmd, owner);
13882 }
13883
// Set the delegation timeout (seconds) used for delegations handed out by
// this client.  Returns 0 on success, -EINVAL if the timeout would not
// expire before the MDS session autoclose window.
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13898
// Acquire or release a delegation on an open file handle.
// cmd == CEPH_DELEGATION_NONE drops any existing delegation; any other
// cmd is forwarded to Inode::set_deleg with the given callback and
// private data.  Returns 0 on success, -ENOTCONN if not mounted,
// -ENOMEM on allocation failure, or set_deleg's result (-EINVAL default).
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      // set_deleg may allocate; surface allocation failure as -ENOMEM
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
13925
// Completion context queued on the interrupt finisher (see ll_interrupt)
// to abort an in-flight SETFILELOCK request.  Takes a reference on the
// request in the constructor so it stays valid until finish() runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    // only file-lock requests are interruptible
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
13941
// Request interruption of an in-flight MDS request; the opaque handle 'd'
// is the MetaRequest.  The actual work is deferred to the interrupt
// finisher thread rather than done inline here.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13949
13950 // =========================================
13951 // layout
13952
13953 // expose file layouts
13954
// Look up the file layout for the inode at 'relpath'.
// Fills *lp and returns 0 on success, a negative errno from the path
// walk on failure, or -ENOTCONN while unmounting.
int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
13974
13975 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13976 {
13977 std::lock_guard lock(client_lock);
13978
13979 if (unmounting)
13980 return -ENOTCONN;
13981
13982 Fh *f = get_filehandle(fd);
13983 if (!f)
13984 return -EBADF;
13985 Inode *in = f->inode.get();
13986
13987 *lp = in->layout;
13988
13989 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
13990 return 0;
13991 }
13992
13993 int64_t Client::get_default_pool_id()
13994 {
13995 std::lock_guard lock(client_lock);
13996
13997 if (unmounting)
13998 return -ENOTCONN;
13999
14000 /* first data pool is the default */
14001 return mdsmap->get_first_data_pool();
14002 }
14003
14004 // expose osdmap
14005
14006 int64_t Client::get_pool_id(const char *pool_name)
14007 {
14008 std::lock_guard lock(client_lock);
14009
14010 if (unmounting)
14011 return -ENOTCONN;
14012
14013 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14014 pool_name);
14015 }
14016
14017 string Client::get_pool_name(int64_t pool)
14018 {
14019 std::lock_guard lock(client_lock);
14020
14021 if (unmounting)
14022 return string();
14023
14024 return objecter->with_osdmap([pool](const OSDMap& o) {
14025 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14026 });
14027 }
14028
14029 int Client::get_pool_replication(int64_t pool)
14030 {
14031 std::lock_guard lock(client_lock);
14032
14033 if (unmounting)
14034 return -ENOTCONN;
14035
14036 return objecter->with_osdmap([pool](const OSDMap& o) {
14037 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
14038 });
14039 }
14040
// Find the acting OSDs for the object backing byte 'off' of open file fd.
// On success fills 'osds' and, when 'len' is non-null, *len with the
// number of bytes from 'off' to the end of its stripe unit.  Returns 0,
// -EBADF for a bad fd, -EINVAL if the PG has no acting OSDs, or
// -ENOTCONN while unmounting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map a 1-byte range at 'off' to exactly one object extent
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
14086
14087 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14088 {
14089 std::lock_guard lock(client_lock);
14090
14091 if (unmounting)
14092 return -ENOTCONN;
14093
14094 if (id < 0)
14095 return -EINVAL;
14096 return objecter->with_osdmap([&](const OSDMap& o) {
14097 return o.crush->get_full_location_ordered(id, path);
14098 });
14099 }
14100
// Return the addresses of the acting OSDs for the object covering byte
// 'offset' of open file fd.  Returns 0 on success, -EBADF for a bad fd,
// -EINVAL if the PG has no acting OSDs, or -ENOTCONN while unmounting.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	// report each OSD's primary (front) address
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
14134
14135 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14136 {
14137 std::lock_guard lock(client_lock);
14138
14139 if (unmounting)
14140 return -ENOTCONN;
14141
14142 return objecter->with_osdmap([&](const OSDMap& o) {
14143 if (!o.exists(osd))
14144 return -ENOENT;
14145
14146 addr = o.get_addrs(osd).front();
14147 return 0;
14148 });
14149 }
14150
// Map byte range [offset, offset+length) of open file fd to its list of
// object extents.  Returns 0, -EBADF for a bad fd, or -ENOTCONN while
// unmounting.  Note the (length, offset) parameter order.
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
14170
14171
/* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // cache the lookup per OSD map epoch; recompute only when the map changes
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
14188
14189
14190
14191
14192
14193
14194 // ===============================
14195
// Messenger hook: a connection we initiated has been established.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
14200
// Messenger hook: the connection was reset.  Returning false indicates
// we did not take over handling of the connection here.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14206
14207 void Client::ms_handle_remote_reset(Connection *con)
14208 {
14209 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14210 std::lock_guard l(client_lock);
14211 switch (con->get_peer_type()) {
14212 case CEPH_ENTITY_TYPE_MDS:
14213 {
14214 // kludge to figure out which mds this is; fixme with a Connection* state
14215 mds_rank_t mds = MDS_RANK_NONE;
14216 MetaSession *s = NULL;
14217 for (auto &p : mds_sessions) {
14218 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14219 mds = p.first;
14220 s = &p.second;
14221 }
14222 }
14223 if (mds >= 0) {
14224 assert (s != NULL);
14225 switch (s->state) {
14226 case MetaSession::STATE_CLOSING:
14227 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14228 _closed_mds_session(s);
14229 break;
14230
14231 case MetaSession::STATE_OPENING:
14232 {
14233 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14234 list<Context*> waiters;
14235 waiters.swap(s->waiting_for_open);
14236 _closed_mds_session(s);
14237 MetaSession *news = _get_or_open_mds_session(mds);
14238 news->waiting_for_open.swap(waiters);
14239 }
14240 break;
14241
14242 case MetaSession::STATE_OPEN:
14243 {
14244 objecter->maybe_request_map(); /* to check if we are blacklisted */
14245 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
14246 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14247 _closed_mds_session(s);
14248 } else {
14249 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14250 s->state = MetaSession::STATE_STALE;
14251 }
14252 }
14253 break;
14254
14255 case MetaSession::STATE_NEW:
14256 case MetaSession::STATE_CLOSED:
14257 default:
14258 break;
14259 }
14260 }
14261 }
14262 break;
14263 }
14264 }
14265
// Messenger hook: the peer actively refused our connection attempt.
// Returning false indicates we did not take over handling here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
14271
// Walk up the snap realm chain from 'in' and return the nearest cached
// ancestor inode with quota enabled; falls back to root_ancestor when no
// quota ancestor is found or an ancestor inode is not in the cache.
// NOTE(review): 'perms' is currently unused in this implementation.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
14293
14294 /**
14295 * Traverse quota ancestors of the Inode, return true
14296 * if any of them passes the passed function
14297 */
14298 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14299 std::function<bool (const Inode &in)> test)
14300 {
14301 while (true) {
14302 ceph_assert(in != NULL);
14303 if (test(*in)) {
14304 return true;
14305 }
14306
14307 if (in == root_ancestor) {
14308 // We're done traversing, drop out
14309 return false;
14310 } else {
14311 // Continue up the tree
14312 in = get_quota_root(in, perms);
14313 }
14314 }
14315
14316 return false;
14317 }
14318
14319 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14320 {
14321 return check_quota_condition(in, perms,
14322 [](const Inode &in) {
14323 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14324 });
14325 }
14326
14327 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14328 const UserPerm& perms)
14329 {
14330 return check_quota_condition(in, perms,
14331 [&new_bytes](const Inode &in) {
14332 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14333 > in.quota.max_bytes;
14334 });
14335 }
14336
// True when the not-yet-reported dirty bytes on this inode are getting
// close to a byte quota on it or any quota ancestor ("close" meaning the
// remaining quota space divided by 16 is smaller than the unreported
// growth).  Used to trigger earlier size reporting while writing.
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  ceph_assert(in->size >= in->reported_size);
  const uint64_t size = in->size - in->reported_size;
  return check_quota_condition(in, perms,
      [&size](const Inode &in) {
	if (in.quota.max_bytes) {
	  if (in.rstat.rbytes >= in.quota.max_bytes) {
	    return true;
	  }

	  // compare unreported growth against 1/16 of the remaining space
	  const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
	  return (space >> 4) < size;
	} else {
	  return false;
	}
      });
}
14355
// State/permission bits cached per (pool, namespace) in pool_perms,
// used by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is currently in flight (others wait)
  POOL_READ = 4,      // client has read access to the pool
  POOL_WRITE = 8,     // client has write access to the pool
};
14362
// Verify this client's OSD-level access to the data pool backing inode
// 'in' (regular files only).  'need' is a CEPH_CAP_FILE_RD/WR mask.
// Results are cached per (pool id, namespace) in pool_perms; concurrent
// checkers wait on waiting_for_pool_perm while a probe is in flight.
// Returns 0 when access is sufficient, -EPERM when the needed access is
// denied, -EIO when the probe failed for some other reason.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // probe read access with a stat of the file's first object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // probe write access with an exclusive create (-EEXIST also proves it)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop the client lock while waiting for both OSD round trips
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14469
// Check 'want' access against the inode's cached POSIX access-ACL xattr.
// Returns posix_acl_permits()'s result when an ACL is present, or
// -EAGAIN when ACLs are disabled / absent (caller falls back to the
// regular mode-bit check).
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}
14481
// Rewrite the inode's access-ACL xattr to reflect a chmod to 'mode'.
// Returns 0 when ACLs are disabled or no access ACL exists, otherwise
// the result of refreshing the xattrs / rewriting the ACL.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // make sure cached xattrs are fresh before we modify the ACL
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy of the ACL buffer
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
14507
// Compute the ACL xattrs for a new inode created under 'dir': apply the
// directory's default ACL (if any) to *mode and encode the resulting
// xattrs into xattrs_bl; when no default ACL applies, apply the umask
// callback (if registered) to *mode instead.  Returns the number of
// xattrs encoded (>= 0) or a negative errno.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks never carry ACLs
  if (S_ISLNK(*mode))
    return 0;

  // refresh the directory's cached xattrs before consulting its ACLs
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	// a non-trivial ACL becomes the child's access ACL
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also inherit the default ACL itself
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14555
14556 void Client::set_filer_flags(int flags)
14557 {
14558 std::lock_guard l(client_lock);
14559 ceph_assert(flags == 0 ||
14560 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14561 objecter->add_global_op_flags(flags);
14562 }
14563
14564 void Client::clear_filer_flags(int flags)
14565 {
14566 std::lock_guard l(client_lock);
14567 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14568 objecter->clear_global_op_flag(flags);
14569 }
14570
14571 // called before mount
14572 void Client::set_uuid(const std::string& uuid)
14573 {
14574 std::lock_guard l(client_lock);
14575 assert(initialized);
14576 assert(!uuid.empty());
14577
14578 metadata["uuid"] = uuid;
14579 _close_sessions();
14580 }
14581
14582 // called before mount. 0 means infinite
14583 void Client::set_session_timeout(unsigned timeout)
14584 {
14585 std::lock_guard l(client_lock);
14586 assert(initialized);
14587
14588 metadata["timeout"] = stringify(timeout);
14589 }
14590
// called before mount
// Reclaim the MDS sessions of a dead client instance identified by
// 'uuid' (e.g. after an NFS-Ganesha failover).  Walks every in-map MDS
// rank, sends MClientReclaim and waits for replies; afterwards (unless
// CEPH_RECLAIM_RESET) verifies via the OSD map that the old session's
// addrs were actually blacklisted.  Returns 0 on success or a negative
// errno.
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  // reclaiming our own uuid makes no sense
  {
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a valid mdsmap before we can walk the ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // note: 'mds' only advances once that rank's reclaim has completed
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      // woken by handle_client_reclaim_reply()
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remembered until finish_reclaim() adopts it as our uuid
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14689
// Finish a reclaim started by start_reclaim().  If no reclaim is in
// progress, just reset per-session reclaim state; otherwise tell every
// MDS the reclaim is done and adopt the reclaimed uuid as our own.
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  // take over the reclaimed session's uuid
  metadata["uuid"] = it->second;
  metadata.erase(it);
}
14708
// Handle the MDS reply to an MClientReclaim sent from start_reclaim().
// On success, records the target session's addrs and the OSD epoch we
// must reach; on failure, records the error in reclaim_errno.  Either
// way, wakes the waiter in start_reclaim().
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
14733
14734 /**
14735 * This is included in cap release messages, to cause
14736 * the MDS to wait until this OSD map epoch. It is necessary
14737 * in corner cases where we cancel RADOS ops, so that
14738 * nobody else tries to do IO to the same objects in
14739 * the same epoch as the cancelled ops.
14740 */
14741 void Client::set_cap_epoch_barrier(epoch_t e)
14742 {
14743 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
14744 cap_epoch_barrier = e;
14745 }
14746
// Config-observer interface: the option names we want change
// notifications for (delivered to handle_conf_change()).
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14759
// Config-observer interface: react to runtime changes of tracked options.
// NOTE(review): only client_cache_mid and client_acl_type are acted on
// here; the other tracked keys (cache size, delegation options) are
// presumably read on demand elsewhere — confirm before relying on live
// updates for them.
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}
14774
// boost::intrusive_ptr support: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14779
// boost::intrusive_ptr support: drop a reference, letting the owning
// client free the inode when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14784
14785 mds_rank_t Client::_get_random_up_mds() const
14786 {
14787 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14788
14789 std::set<mds_rank_t> up;
14790 mdsmap->get_up_mds_set(up);
14791
14792 if (up.empty())
14793 return MDS_RANK_NONE;
14794 std::set<mds_rank_t>::const_iterator p = up.begin();
14795 for (int n = rand() % up.size(); n; n--)
14796 ++p;
14797 return *p;
14798 }
14799
14800
// A StandaloneClient owns its own Objecter (constructed here and passed
// to the Client base), unlike embedded users that share one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, nullptr))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14807
// Tear down the Objecter this class created in its constructor.
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
14813
// Initialize a standalone client: bring up the objecter, register
// dispatchers, and start the monitor client.  On monclient failure the
// partially-initialized state is unwound and the error returned.
int StandaloneClient::init()
{
  _pre_init();
  objecter->init();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
14843
// Shut down in reverse order of init(): client core first, then the
// objecter and monitor client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}