// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use those flags in
// ceph-dokan. The current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

#ifndef S_IXUGO
#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
#endif

using std::dec;
using std::hex;
using std::list;
using std::oct;
using std::pair;
using std::string;
using std::vector;

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}

bool Client::is_reserved_vino(vinodeno_t &vino) {
  if (MDS_IS_PRIVATE_INO(vino.ino)) {
    ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
    return true;
  }
  return false;
}

// running average and standard deviation -- presented in
// Donald Knuth's TAoCP, Volume II.
double calc_average(double old_avg, double value, uint64_t count) {
  double new_avg;
  if (count == 1) {
    new_avg = value;
  } else {
    new_avg = old_avg + ((value - old_avg) / count);
  }

  return new_avg;
}

double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  double new_sq_sum;
  if (count == 1) {
    new_sq_sum = 0.0;
  } else {
    new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean);
  }

  return new_sq_sum;
}
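
// A minimal usage sketch (illustrative only, not part of the build) of how
// the two helpers above combine into a running mean and sample stddev,
// assuming `samples` holds `nr` latency values:
//
//   double avg = 0.0, sq_sum = 0.0;
//   for (uint64_t n = 1; n <= nr; ++n) {
//     double new_avg = calc_average(avg, samples[n-1], n);
//     sq_sum = calc_sq_sum(sq_sum, avg, new_avg, samples[n-1], n);
//     avg = new_avg;
//   }
//   double stddev = nr > 1 ? sqrt(sq_sum / (nr - 1)) : 0.0;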

// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
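
// These handlers back the admin socket commands registered in
// Client::_finish_init(). A typical invocation from outside the process
// (illustrative; the socket path depends on local configuration):
//
//   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_sessions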


// -------------

int Client::get_fd_inode(int fd, InodeRef *in) {
  int r = 0;
  if (fd == CEPHFS_AT_FDCWD) {
    *in = cwd;
  } else {
    Fh *f = get_filehandle(fd);
    if (!f) {
      r = -CEPHFS_EBADF;
    } else {
      *in = f->inode;
    }
  }
  return r;
}

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
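
// Allocation sketch: faked inos 1024~2048 are reserved for
// _assign_faked_root(), so ordinary inodes are handed out monotonically
// from 2048 upwards; when the search runs off the end of the free set,
// last_used_faked_ino wraps back to 2048 and the scan resumes from there,
// reusing inos returned by _release_faked_ino().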

/*
 * In faked mode, if you export multiple subdirectories, the exported
 * subdirectories' root inodes can otherwise end up with the same faked
 * inode number. So we distinguish the mount points by reserving the
 * fake IDs between 1024~2048 and combining them with the last
 * 10 bits (0x3ff) of the root inodes.
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff = " << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
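
// Example (illustrative): on a fresh client the free range starts at 1024,
// so a root inode whose ino has low 10 bits 0x25a gets faked root
// 1024 + 0x25a = 1626 -- a stable value inside the reserved window that
// differs between mounts whose root inos differ in those bits.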

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or was aborted and didn't get a chance
  // to run the umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root.get();
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_nref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root.get(), did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    // average and standard deviation for mds/read/write latencies
    plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
    plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
    plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
    plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
    plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares (to calculate variability/stdev) for read requests");
    plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
    plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
    plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares (to calculate variability/stdev) for write requests");
    plb.add_u64(l_c_wr_ops, "wrops", "Total write IO operations");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before destructing the
    // Client, just in case the _mount() failed but didn't get a
    // chance to stop the tick thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * incoming "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}

void Client::update_io_stat_metadata(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_md_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_md_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_metadata_request);

  logger->tinc(l_c_lat, latency);
  logger->tinc(l_c_reply, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_md_avg, avg);
  logger->set(l_c_md_sqsum, n_sqsum);
  logger->set(l_c_md_ops, nr_metadata_request);
}

void Client::update_io_stat_read(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_rd_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_read_request);

  logger->tinc(l_c_read, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_rd_avg, avg);
  logger->set(l_c_rd_sqsum, n_sqsum);
  logger->set(l_c_rd_ops, nr_read_request);
}

void Client::update_io_stat_write(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_wr_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_write_request);

  logger->tinc(l_c_wrlat, latency);

  utime_t avg;
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_wr_avg, avg);
  logger->set(l_c_wr_sqsum, n_sqsum);
  logger->set(l_c_wr_ops, nr_write_request);
}

// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }
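  // Note: trimming one dentry can unpin others (its parent directory, for
  // instance), which is why the loop above keeps expiring entries until the
  // LRU size stops changing rather than stopping at the first failure.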

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
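
// Ordering sketch (illustrative): the MDS bumps truncate_seq on every
// truncate, so a client caching (size=4096, seq=3) applies an update of
// (size=0, seq=4) and invalidates cached data past the new size, while a
// stale replay of (size=4096, seq=3) is ignored -- it is neither a newer
// seq nor a size growth at the same seq.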

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
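
// Rationale sketch: time_warp_seq exists because timestamps are not
// monotonic -- an explicit utimes() may legitimately set mtime backwards,
// and the MDS bumps the seq when that happens. Taking max(mtime) alone
// would silently undo such a change, so max() is only used when both
// sides agree on the seq.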

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;  // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}
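
// Lease math (illustrative): a reply stamped `from` carrying
// duration_ms=30000 yields lease_ttl = from + 30s; until then, lookups of
// this dentry may be answered from cache without consulting the MDS,
// provided lease_gen still matches the session's cap_gen.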


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated, only update from auth mds reply
  if (from == dst->auth) {
    in->dir_replicated = !dst->dist.empty();
    if (!dst->dist.empty())
      in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
    else
      in->frag_repmap.erase(dst->frag);
  }
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
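
// Offset encoding note: dir_result_t::make_fpos() packs the frag (or, in
// hash_order mode, the dentry name hash) into the high bits of the 64-bit
// readdir offset and the per-frag entry index into the low bits, which is
// why readdir_offset restarts at 2 whenever that high component changes.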

/** insert_trace
 *
 * insert a trace from an MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;
  int issued = 0;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (req->get_op() == CEPH_MDS_OP_GETATTR)
      issued = req->inode()->caps_issued();

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best(issued)) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best(issued)) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
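
// Selection order, in brief: an explicit resend_mds always wins; otherwise
// use the dirfrag map (a random replica, or the auth mds) when a usable
// name hash exists, then the inode's auth cap, then any cap-bearing
// session, and finally fall back to a random up MDS.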

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second->dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1860
1861
1862 /**
1863 * make a request
1864 *
1865 * Blocking helper to make an MDS request.
1866 *
1867 * If the ptarget flag is set, behavior changes slightly: the caller
1868 * expects to get a pointer to the inode we are creating or operating
1869 * on. As a result, we will follow up any traceless mutation reply
1870 * with a getattr or lookup to transparently handle a traceless reply
1871 * from the MDS (as when the MDS restarts and the client has to replay
1872 * a request).
1873 *
1874 * @param request the MetaRequest to execute
1875 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1876 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1877 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1878 * @param use_mds [optional] prefer a specific mds (-1 for default)
1879 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1880 */
1881 int Client::make_request(MetaRequest *request,
1882 const UserPerm& perms,
1883 InodeRef *ptarget, bool *pcreated,
1884 mds_rank_t use_mds,
1885 bufferlist *pdirbl)
1886 {
1887 int r = 0;
1888
1889 // assign a unique tid
1890 ceph_tid_t tid = ++last_tid;
1891 request->set_tid(tid);
1892
1893 // and timestamp
1894 request->op_stamp = ceph_clock_now();
1895 request->created = ceph::coarse_mono_clock::now();
1896
1897 // make note
1898 mds_requests[tid] = request->get();
1899 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1900 oldest_tid = tid;
1901
1902 request->set_caller_perms(perms);
1903
1904 if (cct->_conf->client_inject_fixed_oldest_tid) {
1905 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1906 request->set_oldest_client_tid(1);
1907 } else {
1908 request->set_oldest_client_tid(oldest_tid);
1909 }
1910
1911 // hack target mds?
1912 if (use_mds >= 0)
1913 request->resend_mds = use_mds;
1914
1915 MetaSessionRef session = NULL;
1916 while (1) {
1917 if (request->aborted())
1918 break;
1919
1920 if (blocklisted) {
1921 request->abort(-CEPHFS_EBLOCKLISTED);
1922 break;
1923 }
1924
1925 // set up wait cond
1926 ceph::condition_variable caller_cond;
1927 request->caller_cond = &caller_cond;
1928
1929 // choose mds
1930 Inode *hash_diri = NULL;
1931 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1932 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1933 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1934 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1935 if (hash_diri) {
1936 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1937 _fragmap_remove_stopped_mds(hash_diri, mds);
1938 } else {
1939 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1940 request->resend_mds = _get_random_up_mds();
1941 }
1942 } else {
1943 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1944 wait_on_list(waiting_for_mdsmap);
1945 }
1946 continue;
1947 }
1948
1949 // open a session?
1950 if (!have_open_session(mds)) {
1951 session = _get_or_open_mds_session(mds);
1952 if (session->state == MetaSession::STATE_REJECTED) {
1953 request->abort(-CEPHFS_EPERM);
1954 break;
1955 }
1956 // wait
1957 if (session->state == MetaSession::STATE_OPENING) {
1958 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1959 wait_on_context_list(session->waiting_for_open);
1960 continue;
1961 }
1962
1963 if (!have_open_session(mds))
1964 continue;
1965 } else {
1966 session = mds_sessions.at(mds);
1967 }
1968
1969 // send request.
1970 send_request(request, session.get());
1971
1972 // wait for signal
1973 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1974 request->kick = false;
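// client_lock is already held here; adopting it into a unique_lock lets
// the condition variable unlock/relock it while we wait, and release()
// below hands ownership back so the mutex stays held on exit.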
1975 std::unique_lock l{client_lock, std::adopt_lock};
1976 caller_cond.wait(l, [request] {
1977 return (request->reply || // reply
1978 request->resend_mds >= 0 || // forward
1979 request->kick);
1980 });
1981 l.release();
1982 request->caller_cond = nullptr;
1983
1984 // did we get a reply?
1985 if (request->reply)
1986 break;
1987 }
1988
1989 if (!request->reply) {
1990 ceph_assert(request->aborted());
1991 ceph_assert(!request->got_unsafe);
1992 r = request->get_abort_code();
1993 request->item.remove_myself();
1994 unregister_request(request);
1995 put_request(request);
1996 return r;
1997 }
1998
1999 // got it!
2000 auto reply = std::move(request->reply);
2001 r = reply->get_result();
2002 if (r >= 0)
2003 request->success = true;
2004
2005 // kick dispatcher (we've got it!)
2006 ceph_assert(request->dispatch_cond);
2007 request->dispatch_cond->notify_all();
2008 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
2009 request->dispatch_cond = 0;
2010
2011 if (r >= 0 && ptarget)
2012 r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);
2013
2014 if (pdirbl)
2015 *pdirbl = reply->get_extra_bl();
2016
2017 // -- log times --
2018 utime_t lat = ceph_clock_now();
2019 lat -= request->sent_stamp;
2020 ldout(cct, 20) << "lat " << lat << dendl;
2021
2022 ++nr_metadata_request;
2023 update_io_stat_metadata(lat);
2024
2025 put_request(request);
2026 return r;
2027 }
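// Illustrative caller sketch (not part of the build; the path variable and
// drop masks are hypothetical): a create-style call would look roughly like
//
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
//   req->set_filepath(path);           // plus inode/dentry and drop masks
//   InodeRef target;
//   bool created = false;
//   int r = make_request(req, perms, &target, &created);
//
// On success, 'target' points at the inode we created or operated on, and
// 'created' records whether the create actually made a new file.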
2028
2029 void Client::unregister_request(MetaRequest *req)
2030 {
2031 mds_requests.erase(req->tid);
2032 if (req->tid == oldest_tid) {
2033 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2034 while (true) {
2035 if (p == mds_requests.end()) {
2036 oldest_tid = 0;
2037 break;
2038 }
2039 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2040 oldest_tid = p->first;
2041 break;
2042 }
2043 ++p;
2044 }
2045 }
2046 put_request(req);
2047 }
2048
2049 void Client::put_request(MetaRequest *request)
2050 {
2051 if (request->_put()) {
2052 int op = -1;
2053 if (request->success)
2054 op = request->get_op();
2055 InodeRef other_in;
2056 request->take_other_inode(&other_in);
2057 delete request;
2058
2059 if (other_in &&
2060 (op == CEPH_MDS_OP_RMDIR ||
2061 op == CEPH_MDS_OP_RENAME ||
2062 op == CEPH_MDS_OP_RMSNAP)) {
2063 _try_to_trim_inode(other_in.get(), false);
2064 }
2065 }
2066 }
2067
2068 int Client::encode_inode_release(Inode *in, MetaRequest *req,
2069 mds_rank_t mds, int drop,
2070 int unless, int force)
2071 {
2072 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
2073 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
2074 << ", force:" << force << ")" << dendl;
2075 int released = 0;
2076 auto it = in->caps.find(mds);
2077 if (it != in->caps.end()) {
2078 Cap &cap = it->second;
2079 drop &= ~(in->dirty_caps | get_caps_used(in));
2080 if ((drop & cap.issued) &&
2081 !(unless & cap.issued)) {
2082 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
2083 cap.issued &= ~drop;
2084 cap.implemented &= ~drop;
2085 released = 1;
2086 } else {
2087 released = force;
2088 }
2089 if (released) {
2090 cap.wanted = in->caps_wanted();
2091 if (&cap == in->auth_cap &&
2092 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
2093 in->requested_max_size = 0;
2094 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
2095 }
2096 ceph_mds_request_release rel;
2097 rel.ino = in->ino;
2098 rel.cap_id = cap.cap_id;
2099 rel.seq = cap.seq;
2100 rel.issue_seq = cap.issue_seq;
2101 rel.mseq = cap.mseq;
2102 rel.caps = cap.implemented;
2103 rel.wanted = cap.wanted;
2104 rel.dname_len = 0;
2105 rel.dname_seq = 0;
2106 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2107 }
2108 }
2109 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
2110 << released << dendl;
2111 return released;
2112 }
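// Worked example of the drop/unless semantics above: with drop=Fs
// (CEPH_CAP_FILE_SHARED) and unless=Fx (CEPH_CAP_FILE_EXCL), Fs is
// stripped only if it is currently issued while Fx is not; and because of
// the 'drop &= ~(in->dirty_caps | get_caps_used(in))' mask, dirty or
// in-use caps are never dropped.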
2113
2114 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2115 mds_rank_t mds, int drop, int unless)
2116 {
2117 ldout(cct, 20) << __func__ << " enter(dn:"
2118 << dn << ")" << dendl;
2119 int released = 0;
2120 if (dn->dir)
2121 released = encode_inode_release(dn->dir->parent_inode, req,
2122 mds, drop, unless, 1);
2123 if (released && dn->lease_mds == mds) {
2124 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2125 auto& rel = req->cap_releases.back();
2126 rel.item.dname_len = dn->name.length();
2127 rel.item.dname_seq = dn->lease_seq;
2128 rel.dname = dn->name;
2129 dn->lease_mds = -1;
2130 }
2131 ldout(cct, 25) << __func__ << " exit(dn:"
2132 << dn << ")" << dendl;
2133 }
2134
2135
2136 /*
2137 * This requires the MetaRequest *req to be set up.
2138 * It will error out horribly otherwise.
2139 * Additionally, if you set any *drop member, you'd better have
2140 * set the corresponding dentry!
2141 */
2142 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2143 {
2144 ldout(cct, 20) << __func__ << " enter (req: "
2145 << req << ", mds: " << mds << ")" << dendl;
2146 if (req->inode_drop && req->inode())
2147 encode_inode_release(req->inode(), req,
2148 mds, req->inode_drop,
2149 req->inode_unless);
2150
2151 if (req->old_inode_drop && req->old_inode())
2152 encode_inode_release(req->old_inode(), req,
2153 mds, req->old_inode_drop,
2154 req->old_inode_unless);
2155 if (req->other_inode_drop && req->other_inode())
2156 encode_inode_release(req->other_inode(), req,
2157 mds, req->other_inode_drop,
2158 req->other_inode_unless);
2159
2160 if (req->dentry_drop && req->dentry())
2161 encode_dentry_release(req->dentry(), req,
2162 mds, req->dentry_drop,
2163 req->dentry_unless);
2164
2165 if (req->old_dentry_drop && req->old_dentry())
2166 encode_dentry_release(req->old_dentry(), req,
2167 mds, req->old_dentry_drop,
2168 req->old_dentry_unless);
2169 ldout(cct, 25) << __func__ << " exit (req: "
2170 << req << ", mds: " << mds << ")" << dendl;
2171 }
2172
2173 bool Client::have_open_session(mds_rank_t mds)
2174 {
2175 const auto &it = mds_sessions.find(mds);
2176 return it != mds_sessions.end() &&
2177 (it->second->state == MetaSession::STATE_OPEN ||
2178 it->second->state == MetaSession::STATE_STALE);
2179 }
2180
2181 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2182 {
2183 const auto &it = mds_sessions.find(mds);
2184 if (it == mds_sessions.end() || it->second->con != con) {
2185 return NULL;
2186 } else {
2187 return it->second;
2188 }
2189 }
2190
2191 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2192 {
2193 auto it = mds_sessions.find(mds);
2194 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2195 }
2196
2197 /**
2198 * Populate a map of strings with client-identifying metadata,
2199 * such as the hostname. Call this once at initialization.
2200 */
2201 void Client::populate_metadata(const std::string &mount_root)
2202 {
2203 // Hostname
2204 #ifdef _WIN32
2205 // TODO: move this to compat.h
2206 char hostname[64];
2207 DWORD hostname_sz = 64;
2208 GetComputerNameA(hostname, &hostname_sz);
2209 metadata["hostname"] = hostname;
2210 #else
2211 struct utsname u;
2212 int r = uname(&u);
2213 if (r >= 0) {
2214 metadata["hostname"] = u.nodename;
2215 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2216 } else {
2217 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(errno) << ")" << dendl;
2218 }
2219 #endif
2220
2221 metadata["pid"] = stringify(getpid());
2222
2223 // Ceph entity id (the '0' in "client.0")
2224 metadata["entity_id"] = cct->_conf->name.get_id();
2225
2226 // Our mount position
2227 if (!mount_root.empty()) {
2228 metadata["root"] = mount_root;
2229 }
2230
2231 // Ceph version
2232 metadata["ceph_version"] = pretty_version_to_str();
2233 metadata["ceph_sha1"] = git_version_to_str();
2234
2235 // Apply any metadata from the user's configured overrides
2236 std::vector<std::string> tokens;
2237 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2238 for (const auto &i : tokens) {
2239 auto eqpos = i.find("=");
2240 // Throw out anything that isn't of the form "<str>=<str>"
2241 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size() - 1) {
2242 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2243 continue;
2244 }
2245 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2246 }
2247 }
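// For example, 'client metadata = "rack=r1,foo=bar"' in the conf adds
// metadata["rack"] = "r1" and metadata["foo"] = "bar", while malformed
// tokens such as "=bar", "foo=" or "foo" are logged and skipped.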
2248
2249 /**
2250 * Optionally add or override client metadata fields.
2251 */
2252 void Client::update_metadata(std::string const &k, std::string const &v)
2253 {
2254 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2255 ceph_assert(iref_reader.is_state_satisfied());
2256
2257 std::scoped_lock l(client_lock);
2258
2259 auto it = metadata.find(k);
2260 if (it != metadata.end()) {
2261 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2262 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2263 }
2264
2265 metadata[k] = v;
2266 }
2267
2268 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2269 {
2270 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2271 auto addrs = mdsmap->get_addrs(mds);
2272 auto em = mds_sessions.emplace(std::piecewise_construct,
2273 std::forward_as_tuple(mds),
2274 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2275 ceph_assert(em.second); /* not already present */
2276 auto session = em.first->second;
2277
2278 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2279 m->metadata = metadata;
2280 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2281 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2282 session->con->send_message2(std::move(m));
2283 return session;
2284 }
2285
2286 void Client::_close_mds_session(MetaSession *s)
2287 {
2288 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2289 s->state = MetaSession::STATE_CLOSING;
2290 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2291 }
2292
2293 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2294 {
2295 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2296 if (rejected && s->state != MetaSession::STATE_CLOSING)
2297 s->state = MetaSession::STATE_REJECTED;
2298 else
2299 s->state = MetaSession::STATE_CLOSED;
2300 s->con->mark_down();
2301 signal_context_list(s->waiting_for_open);
2302 mount_cond.notify_all();
2303 remove_session_caps(s, err);
2304 kick_requests_closed(s);
2305 mds_ranks_closing.erase(s->mds_num);
2306 if (s->state == MetaSession::STATE_CLOSED)
2307 mds_sessions.erase(s->mds_num);
2308 }
2309
2310 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2311 {
2312 mds_rank_t from = mds_rank_t(m->get_source().num());
2313 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2314
2315 std::scoped_lock cl(client_lock);
2316 auto session = _get_mds_session(from, m->get_connection().get());
2317 if (!session) {
2318 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2319 return;
2320 }
2321
2322 switch (m->get_op()) {
2323 case CEPH_SESSION_OPEN:
2324 {
2325 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2326 missing_features -= m->supported_features;
2327 if (!missing_features.empty()) {
2328 lderr(cct) << "mds." << from << " lacks required features '"
2329 << missing_features << "', closing session " << dendl;
2330 _close_mds_session(session.get());
2331 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2332 break;
2333 }
2334 session->mds_features = std::move(m->supported_features);
2335 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
2336
2337 renew_caps(session.get());
2338 session->state = MetaSession::STATE_OPEN;
2339 if (is_unmounting())
2340 mount_cond.notify_all();
2341 else
2342 connect_mds_targets(from);
2343 signal_context_list(session->waiting_for_open);
2344 break;
2345 }
2346
2347 case CEPH_SESSION_CLOSE:
2348 _closed_mds_session(session.get());
2349 break;
2350
2351 case CEPH_SESSION_RENEWCAPS:
2352 if (session->cap_renew_seq == m->get_seq()) {
2353 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2354 session->cap_ttl =
2355 session->last_cap_renew_request + mdsmap->get_session_timeout();
2356 if (was_stale)
2357 wake_up_session_caps(session.get(), false);
2358 }
2359 break;
2360
2361 case CEPH_SESSION_STALE:
2362 // invalidate session caps/leases
2363 session->cap_gen++;
2364 session->cap_ttl = ceph_clock_now();
2365 session->cap_ttl -= 1;
2366 renew_caps(session.get());
2367 break;
2368
2369 case CEPH_SESSION_RECALL_STATE:
2370 /*
2371 * Renew the caps and flush the cap releases just before
2372 * trimming the caps, in case tick() won't get a chance
2373 * to run them; otherwise the client could be blocklisted
2374 * and the MDS daemons would keep trying to recall the caps
2375 * again and again.
2376 *
2377 * In most cases this will do nothing, and the new cap releases
2378 * added by the trim_caps() that follows will have their
2379 * flushing deferred to tick().
2380 */
2381 renew_and_flush_cap_releases();
2382 trim_caps(session.get(), m->get_max_caps());
2383 break;
2384
2385 case CEPH_SESSION_FLUSHMSG:
2386 /* flush cap release */
2387 if (auto& m = session->release; m) {
2388 session->con->send_message2(std::move(m));
2389 }
2390 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2391 break;
2392
2393 case CEPH_SESSION_FORCE_RO:
2394 force_session_readonly(session.get());
2395 break;
2396
2397 case CEPH_SESSION_REJECT:
2398 {
2399 std::string_view error_str;
2400 auto it = m->metadata.find("error_string");
2401 if (it != m->metadata.end())
2402 error_str = it->second;
2403 else
2404 error_str = "unknown error";
2405 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2406
2407 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
2408 }
2409 break;
2410
2411 default:
2412 ceph_abort();
2413 }
2414 }
2415
2416 bool Client::_any_stale_sessions() const
2417 {
2418 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2419
2420 for (const auto &p : mds_sessions) {
2421 if (p.second->state == MetaSession::STATE_STALE) {
2422 return true;
2423 }
2424 }
2425
2426 return false;
2427 }
2428
2429 void Client::_kick_stale_sessions()
2430 {
2431 ldout(cct, 1) << __func__ << dendl;
2432
2433 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2434 auto s = (it++)->second; // advance now: erasing/closing below invalidates the old entry
2435 if (s->state == MetaSession::STATE_REJECTED) {
2436 mds_sessions.erase(s->mds_num);
2437 continue;
2438 }
2439 if (s->state == MetaSession::STATE_STALE)
2440 _closed_mds_session(s.get());
2441 }
2442 }
2443
2444 void Client::send_request(MetaRequest *request, MetaSession *session,
2445 bool drop_cap_releases)
2446 {
2447 // make the request
2448 mds_rank_t mds = session->mds_num;
2449 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2450 << " for mds." << mds << dendl;
2451 auto r = build_client_request(request);
2452 if (request->dentry()) {
2453 r->set_dentry_wanted();
2454 }
2455 if (request->got_unsafe) {
2456 r->set_replayed_op();
2457 if (request->target)
2458 r->head.ino = request->target->ino;
2459 } else {
2460 encode_cap_releases(request, mds);
2461 if (drop_cap_releases) // we haven't sent the cap reconnect yet; drop cap releases
2462 request->cap_releases.clear();
2463 else
2464 r->releases.swap(request->cap_releases);
2465 }
2466 r->set_mdsmap_epoch(mdsmap->get_epoch());
2467 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2468 objecter->with_osdmap([r](const OSDMap& o) {
2469 r->set_osdmap_epoch(o.get_epoch());
2470 });
2471 }
2472
2473 if (request->mds == -1) {
2474 request->sent_stamp = ceph_clock_now();
2475 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2476 }
2477 request->mds = mds;
2478
2479 Inode *in = request->inode();
2480 if (in) {
2481 auto it = in->caps.find(mds);
2482 if (it != in->caps.end()) {
2483 request->sent_on_mseq = it->second.mseq;
2484 }
2485 }
2486
2487 session->requests.push_back(&request->item);
2488
2489 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2490 session->con->send_message2(std::move(r));
2491 }
2492
2493 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2494 {
2495 auto req = make_message<MClientRequest>(request->get_op());
2496 req->set_tid(request->tid);
2497 req->set_stamp(request->op_stamp);
2498 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2499
2500 // if the filepaths haven't been set, set them!
2501 if (request->path.empty()) {
2502 Inode *in = request->inode();
2503 Dentry *de = request->dentry();
2504 if (in)
2505 in->make_nosnap_relative_path(request->path);
2506 else if (de) {
2507 if (de->inode)
2508 de->inode->make_nosnap_relative_path(request->path);
2509 else if (de->dir) {
2510 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2511 request->path.push_dentry(de->name);
2512 }
2513 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2514 << " No path, inode, or appropriately-endowed dentry given!"
2515 << dendl;
2516 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2517 << " No path, inode, or dentry given!"
2518 << dendl;
2519 }
2520 req->set_filepath(request->get_filepath());
2521 req->set_filepath2(request->get_filepath2());
2522 req->set_alternate_name(request->alternate_name);
2523 req->set_data(request->data);
2524 req->set_retry_attempt(request->retry_attempt++);
2525 req->head.num_fwd = request->num_fwd;
2526 const gid_t *_gids;
2527 int gid_count = request->perms.get_gids(&_gids);
2528 req->set_gid_list(gid_count, _gids);
2529 return req;
2530 }
2531
2532
2533
2534 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2535 {
2536 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2537
2538 std::scoped_lock cl(client_lock);
2539 auto session = _get_mds_session(mds, fwd->get_connection().get());
2540 if (!session) {
2541 return;
2542 }
2543 ceph_tid_t tid = fwd->get_tid();
2544
2545 if (mds_requests.count(tid) == 0) {
2546 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2547 return;
2548 }
2549
2550 MetaRequest *request = mds_requests[tid];
2551 ceph_assert(request);
2552
2553 /*
2554 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2555 * is 'int32_t', while in 'ceph_mds_request_head' the
2556 * type is '__u8'. So if the request bounces between
2557 * MDSes more than 255 times, the client will get stuck.
2558 *
2559 * In this case it's usually a bug in the MDS, and
2560 * continuing to bounce the request makes no sense.
2561 *
2562 * In the future this could be fixed in the ceph code, so
2563 * avoid hardcoding the limit here.
2564 */
2565 int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
2566 max_fwd = (1 << (max_fwd * CHAR_BIT)) - 1;
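// e.g. num_fwd in ceph_mds_request_head is a __u8, so sizeof() is 1 and
// max_fwd works out to (1 << 8) - 1 == 255.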
2567 auto num_fwd = fwd->get_num_fwd();
2568 if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
2569 if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
2570 request->abort(-EMULTIHOP);
2571 request->caller_cond->notify_all();
2572 ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
2573 << ", abort it" << dendl;
2574 } else {
2575 ldout(cct, 10) << __func__ << " tid " << tid
2576 << " old fwd seq " << fwd->get_num_fwd()
2577 << " <= req fwd " << request->num_fwd
2578 << ", ignore it" << dendl;
2579 }
2580 return;
2581 }
2582
2583 // reset retry counter
2584 request->retry_attempt = 0;
2585
2586 // request not forwarded, or dest mds has no session.
2587 // resend.
2588 ldout(cct, 10) << __func__ << " tid " << tid
2589 << " fwd " << fwd->get_num_fwd()
2590 << " to mds." << fwd->get_dest_mds()
2591 << ", resending to " << fwd->get_dest_mds()
2592 << dendl;
2593
2594 request->mds = -1;
2595 request->item.remove_myself();
2596 request->num_fwd = num_fwd;
2597 request->resend_mds = fwd->get_dest_mds();
2598 request->caller_cond->notify_all();
2599 }
2600
2601 bool Client::is_dir_operation(MetaRequest *req)
2602 {
2603 int op = req->get_op();
2604 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2605 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2606 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2607 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2608 return true;
2609 return false;
2610 }
2611
2612 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2613 {
2614 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2615
2616 std::scoped_lock cl(client_lock);
2617 auto session = _get_mds_session(mds_num, reply->get_connection().get());
2618 if (!session) {
2619 return;
2620 }
2621
2622 ceph_tid_t tid = reply->get_tid();
2623 bool is_safe = reply->is_safe();
2624
2625 if (mds_requests.count(tid) == 0) {
2626 lderr(cct) << __func__ << " no pending request on tid " << tid
2627 << " safe is:" << is_safe << dendl;
2628 return;
2629 }
2630 MetaRequest *request = mds_requests.at(tid);
2631
2632 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2633 << " tid " << tid << dendl;
2634
2635 if (request->got_unsafe && !is_safe) {
2636 //duplicate response
2637 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2638 << mds_num << " safe:" << is_safe << dendl;
2639 return;
2640 }
2641
2642 ceph_assert(!request->reply);
2643 request->reply = reply;
2644 insert_trace(request, session.get());
2645
2646 // Handle unsafe reply
2647 if (!is_safe) {
2648 request->got_unsafe = true;
2649 session->unsafe_requests.push_back(&request->unsafe_item);
2650 if (is_dir_operation(request)) {
2651 Inode *dir = request->inode();
2652 ceph_assert(dir);
2653 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2654 }
2655 if (request->target) {
2656 InodeRef &in = request->target;
2657 in->unsafe_ops.push_back(&request->unsafe_target_item);
2658 }
2659 }
2660
2661 // Only signal the caller once (on the first reply):
2662 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2663 if (!is_safe || !request->got_unsafe) {
2664 ceph::condition_variable cond;
2665 request->dispatch_cond = &cond;
2666
2667 // wake up waiter
2668 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2669 request->caller_cond->notify_all();
2670
2671 // wake for kick back
2672 std::unique_lock l{client_lock, std::adopt_lock};
2673 cond.wait(l, [tid, request, &cond, this] {
2674 if (request->dispatch_cond) {
2675 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2676 << tid << " " << &cond << dendl;
2677 }
2678 return !request->dispatch_cond;
2679 });
2680 l.release();
2681 }
2682
2683 if (is_safe) {
2684 // the filesystem change is committed to disk
2685 // we're done, clean up
2686 if (request->got_unsafe) {
2687 request->unsafe_item.remove_myself();
2688 request->unsafe_dir_item.remove_myself();
2689 request->unsafe_target_item.remove_myself();
2690 signal_cond_list(request->waitfor_safe);
2691 }
2692 request->item.remove_myself();
2693 unregister_request(request);
2694 }
2695 if (is_unmounting())
2696 mount_cond.notify_all();
2697 }
2698
2699 void Client::_handle_full_flag(int64_t pool)
2700 {
2701 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2702 << "on " << pool << dendl;
2703 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2704 // to do this rather than blocking, because otherwise when we fill up we
2705 // potentially lock caps forever on files with dirty pages, and we need
2706 // to be able to release those caps to the MDS so that it can delete files
2707 // and free up space.
2708 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2709
2710 // For all inodes with layouts in this pool and a pending flush write op
2711 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2712 // from ObjectCacher so that it doesn't re-issue the write in response to
2713 // the ENOSPC error.
2714 // Fortunately since we're cancelling everything in a given pool, we don't
2715 // need to know which ops belong to which ObjectSet, we can just blow all
2716 // the un-flushed cached data away and mark any dirty inodes' async_err
2717 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2718 // affecting this pool, and all the objectsets we're purging were also
2719 // in this pool.
2720 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2721 i != inode_map.end(); ++i)
2722 {
2723 Inode *inode = i->second;
2724 if (inode->oset.dirty_or_tx
2725 && (pool == -1 || inode->layout.pool_id == pool)) {
2726 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2727 << " has dirty objects, purging and setting ENOSPC" << dendl;
2728 objectcacher->purge_set(&inode->oset);
2729 inode->set_async_err(-CEPHFS_ENOSPC);
2730 }
2731 }
2732
2733 if (cancelled_epoch != (epoch_t)-1) {
2734 set_cap_epoch_barrier(cancelled_epoch);
2735 }
2736 }
2737
2738 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2739 {
2740 std::scoped_lock cl(client_lock);
2741
2742 const auto myaddrs = messenger->get_myaddrs();
2743 bool new_blocklist = objecter->with_osdmap(
2744 [&](const OSDMap& o) {
2745 return o.is_blocklisted(myaddrs);
2746 });
2747
2748 if (new_blocklist && !blocklisted) {
2749 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2750 return o.get_epoch();
2751 });
2752 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2753 blocklisted = true;
2754
2755 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2756
2757 // Since we know all our OSD ops will fail, cancel them all preemptively,
2758 // so that on an unhealthy cluster we can umount promptly even if e.g.
2759 // some PGs were inaccessible.
2760 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2761
2762 }
2763
2764 if (blocklisted) {
2765 // Handle case where we were blocklisted but no longer are
2766 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2767 return o.is_blocklisted(myaddrs);});
2768 }
2769
2770 // While blocklisted, keep subscribing to the next osdmap
2771 // until this client is no longer blocklisted.
2772 if (blocklisted) {
2773 objecter->maybe_request_map();
2774 }
2775
2776 if (objecter->osdmap_full_flag()) {
2777 _handle_full_flag(-1);
2778 } else {
2779 // Accumulate local list of full pools so that I can drop
2780 // the objecter lock before re-entering objecter in
2781 // cancel_writes
2782 std::vector<int64_t> full_pools;
2783
2784 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2785 for (const auto& kv : o.get_pools()) {
2786 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2787 full_pools.push_back(kv.first);
2788 }
2789 }
2790 });
2791
2792 for (auto p : full_pools)
2793 _handle_full_flag(p);
2794
2795 // Subscribe to subsequent maps to watch for the full flag going
2796 // away. For the global full flag objecter does this for us, but
2797 // it pays no attention to the per-pool full flag so in this branch
2798 // we do it ourselves.
2799 if (!full_pools.empty()) {
2800 objecter->maybe_request_map();
2801 }
2802 }
2803 }
2804
2805
2806 // ------------------------
2807 // incoming messages
2808
2809
2810 bool Client::ms_dispatch2(const MessageRef &m)
2811 {
2812 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2813 if (!iref_reader.is_state_satisfied()) {
2814 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2815 return true;
2816 }
2817
2818 switch (m->get_type()) {
2819 // mounting and mds sessions
2820 case CEPH_MSG_MDS_MAP:
2821 handle_mds_map(ref_cast<MMDSMap>(m));
2822 break;
2823 case CEPH_MSG_FS_MAP:
2824 handle_fs_map(ref_cast<MFSMap>(m));
2825 break;
2826 case CEPH_MSG_FS_MAP_USER:
2827 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2828 break;
2829 case CEPH_MSG_CLIENT_SESSION:
2830 handle_client_session(ref_cast<MClientSession>(m));
2831 break;
2832
2833 case CEPH_MSG_OSD_MAP:
2834 handle_osd_map(ref_cast<MOSDMap>(m));
2835 break;
2836
2837 // requests
2838 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2839 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2840 break;
2841 case CEPH_MSG_CLIENT_REPLY:
2842 handle_client_reply(ref_cast<MClientReply>(m));
2843 break;
2844
2845 // reclaim reply
2846 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2847 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2848 break;
2849
2850 case CEPH_MSG_CLIENT_SNAP:
2851 handle_snap(ref_cast<MClientSnap>(m));
2852 break;
2853 case CEPH_MSG_CLIENT_CAPS:
2854 handle_caps(ref_cast<MClientCaps>(m));
2855 break;
2856 case CEPH_MSG_CLIENT_LEASE:
2857 handle_lease(ref_cast<MClientLease>(m));
2858 break;
2859 case MSG_COMMAND_REPLY:
2860 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2861 handle_command_reply(ref_cast<MCommandReply>(m));
2862 } else {
2863 return false;
2864 }
2865 break;
2866 case CEPH_MSG_CLIENT_QUOTA:
2867 handle_quota(ref_cast<MClientQuota>(m));
2868 break;
2869
2870 default:
2871 return false;
2872 }
2873
2874 // unmounting?
2875 std::scoped_lock cl(client_lock);
2876 if (is_unmounting()) {
2877 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2878 << "+" << inode_map.size() << dendl;
2879 uint64_t size = lru.lru_get_size() + inode_map.size();
2880 trim_cache();
2881 if (size > lru.lru_get_size() + inode_map.size()) {
2882 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2883 mount_cond.notify_all();
2884 } else {
2885 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2886 << "+" << inode_map.size() << dendl;
2887 }
2888 }
2889
2890 return true;
2891 }
2892
2893 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2894 {
2895 std::scoped_lock cl(client_lock);
2896 fsmap.reset(new FSMap(m->get_fsmap()));
2897
2898 signal_cond_list(waiting_for_fsmap);
2899
2900 monclient->sub_got("fsmap", fsmap->get_epoch());
2901 }
2902
2903 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2904 {
2905 std::scoped_lock cl(client_lock);
2906 fsmap_user.reset(new FSMapUser);
2907 *fsmap_user = m->get_fsmap();
2908
2909 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2910 signal_cond_list(waiting_for_fsmap);
2911 }
2912
2913 // Cancel all the commands for missing or laggy GIDs
2914 void Client::cancel_commands(const MDSMap& newmap)
2915 {
2916 std::vector<ceph_tid_t> cancel_ops;
2917
2918 std::scoped_lock cmd_lock(command_lock);
2919 auto &commands = command_table.get_commands();
2920 for (const auto &[tid, op] : commands) {
2921 const mds_gid_t op_mds_gid = op.mds_gid;
2922 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2923 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2924 cancel_ops.push_back(tid);
2925 if (op.outs) {
2926 std::ostringstream ss;
2927 ss << "MDS " << op_mds_gid << " went away";
2928 *(op.outs) = ss.str();
2929 }
2930 /*
2931 * No need to hold client_lock for the
2932 * con->mark_down() here, because the con
2933 * has its own lock.
2934 */
2935 op.con->mark_down();
2936 if (op.on_finish)
2937 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2938 }
2939 }
2940
2941 for (const auto &tid : cancel_ops)
2942 command_table.erase(tid);
2943 }
2944
2945 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2946 {
2947 std::unique_lock cl(client_lock);
2948 if (m->get_epoch() <= mdsmap->get_epoch()) {
2949 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2950 << " is identical to or older than our "
2951 << mdsmap->get_epoch() << dendl;
2952 return;
2953 }
2954
2955 cl.unlock();
2956 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2957 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2958 _mdsmap->decode(m->get_encoded());
2959 cancel_commands(*_mdsmap.get());
2960 cl.lock();
2961
2962 _mdsmap.swap(mdsmap);
2963
2964 // reset session
2965 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2966 mds_rank_t mds = p->first;
2967 MetaSessionRef session = p->second;
2968 ++p;
2969
2970 int oldstate = _mdsmap->get_state(mds);
2971 int newstate = mdsmap->get_state(mds);
2972 if (!mdsmap->is_up(mds)) {
2973 session->con->mark_down();
2974 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2975 auto old_inc = _mdsmap->get_incarnation(mds);
2976 auto new_inc = mdsmap->get_incarnation(mds);
2977 if (old_inc != new_inc) {
2978 ldout(cct, 1) << "mds incarnation changed from "
2979 << old_inc << " to " << new_inc << dendl;
2980 oldstate = MDSMap::STATE_NULL;
2981 }
2982 session->con->mark_down();
2983 session->addrs = mdsmap->get_addrs(mds);
2984 // When a new MDS starts to take over, notify the kernel to trim unused
2985 // entries in its dcache/icache. Hopefully, the kernel will release some
2986 // unused inodes before the new MDS enters the reconnect state.
2987 trim_cache_for_reconnect(session.get());
2988 } else if (oldstate == newstate)
2989 continue; // no change
2990
2991 session->mds_state = newstate;
2992 if (newstate == MDSMap::STATE_RECONNECT) {
2993 session->con = messenger->connect_to_mds(session->addrs);
2994 send_reconnect(session.get());
2995 } else if (newstate > MDSMap::STATE_RECONNECT) {
2996 if (oldstate < MDSMap::STATE_RECONNECT) {
2997 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2998 _closed_mds_session(session.get());
2999 continue;
3000 }
3001 if (newstate >= MDSMap::STATE_ACTIVE) {
3002 if (oldstate < MDSMap::STATE_ACTIVE) {
3003 // kick new requests
3004 kick_requests(session.get());
3005 kick_flushing_caps(session.get());
3006 signal_context_list(session->waiting_for_open);
3007 wake_up_session_caps(session.get(), true);
3008 }
3009 connect_mds_targets(mds);
3010 }
3011 } else if (newstate == MDSMap::STATE_NULL &&
3012 mds >= mdsmap->get_max_mds()) {
3013 _closed_mds_session(session.get());
3014 }
3015 }
3016
3017 // kick any waiting threads
3018 signal_cond_list(waiting_for_mdsmap);
3019
3020 monclient->sub_got("mdsmap", mdsmap->get_epoch());
3021 }
3022
3023 void Client::send_reconnect(MetaSession *session)
3024 {
3025 mds_rank_t mds = session->mds_num;
3026 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
3027
3028 // trim unused caps to reduce MDS's cache rejoin time
3029 trim_cache_for_reconnect(session);
3030
3031 session->readonly = false;
3032
3033 session->release.reset();
3034
3035 // reset my cap seq number
3036 session->seq = 0;
3037 // connect to the mds' offload targets
3038 connect_mds_targets(mds);
3039 // make sure unsafe requests get saved
3040 resend_unsafe_requests(session);
3041
3042 early_kick_flushing_caps(session);
3043
3044 auto m = make_message<MClientReconnect>();
3045 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
3046
3047 // i have an open session.
3048 ceph::unordered_set<inodeno_t> did_snaprealm;
3049 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
3050 p != inode_map.end();
3051 ++p) {
3052 Inode *in = p->second;
3053 auto it = in->caps.find(mds);
3054 if (it != in->caps.end()) {
3055 if (allow_multi &&
3056 m->get_approx_size() >=
3057 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
3058 m->mark_more();
3059 session->con->send_message2(std::move(m));
3060
3061 m = make_message<MClientReconnect>();
3062 }
3063
3064 Cap &cap = it->second;
3065 ldout(cct, 10) << " caps on " << p->first
3066 << " " << ccap_string(cap.issued)
3067 << " wants " << ccap_string(in->caps_wanted())
3068 << dendl;
3069 filepath path;
3070 in->make_short_path(path);
3071 ldout(cct, 10) << " path " << path << dendl;
3072
3073 bufferlist flockbl;
3074 _encode_filelocks(in, flockbl);
3075
3076 cap.seq = 0; // reset seq.
3077 cap.issue_seq = 0; // reset seq.
3078 cap.mseq = 0; // reset seq.
3079 // cap gen should catch up with session cap_gen
3080 if (cap.gen < session->cap_gen) {
3081 cap.gen = session->cap_gen;
3082 cap.issued = cap.implemented = CEPH_CAP_PIN;
3083 } else {
3084 cap.issued = cap.implemented;
3085 }
3086 snapid_t snap_follows = 0;
3087 if (!in->cap_snaps.empty())
3088 snap_follows = in->cap_snaps.begin()->first;
3089
3090 m->add_cap(p->first.ino,
3091 cap.cap_id,
3092 path.get_ino(), path.get_path(), // ino
3093 in->caps_wanted(), // wanted
3094 cap.issued, // issued
3095 in->snaprealm->ino,
3096 snap_follows,
3097 flockbl);
3098
3099 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
3100 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
3101 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
3102 did_snaprealm.insert(in->snaprealm->ino);
3103 }
3104 }
3105 }
3106
3107 if (!allow_multi)
3108 m->set_encoding_version(0); // use connection features to choose encoding
3109 session->con->send_message2(std::move(m));
3110
3111 mount_cond.notify_all();
3112
3113 if (session->reclaim_state == MetaSession::RECLAIMING)
3114 signal_cond_list(waiting_for_reclaim);
3115 }
3116
3117
3118 void Client::kick_requests(MetaSession *session)
3119 {
3120 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3121 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3122 p != mds_requests.end();
3123 ++p) {
3124 MetaRequest *req = p->second;
3125 if (req->got_unsafe)
3126 continue;
3127 if (req->aborted()) {
3128 if (req->caller_cond) {
3129 req->kick = true;
3130 req->caller_cond->notify_all();
3131 }
3132 continue;
3133 }
3134 if (req->retry_attempt > 0)
3135 continue; // new requests only
3136 if (req->mds == session->mds_num) {
3137 send_request(p->second, session);
3138 }
3139 }
3140 }
3141
3142 void Client::resend_unsafe_requests(MetaSession *session)
3143 {
3144 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3145 !iter.end();
3146 ++iter)
3147 send_request(*iter, session);
3148
3149 // also re-send old requests when the MDS enters the reconnect stage, so
3150 // that the MDS can process completed requests in the clientreplay stage.
3151 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3152 p != mds_requests.end();
3153 ++p) {
3154 MetaRequest *req = p->second;
3155 if (req->got_unsafe)
3156 continue;
3157 if (req->aborted())
3158 continue;
3159 if (req->retry_attempt == 0)
3160 continue; // old requests only
3161 if (req->mds == session->mds_num)
3162 send_request(req, session, true);
3163 }
3164 }
3165
3166 void Client::wait_unsafe_requests()
3167 {
3168 list<MetaRequest*> last_unsafe_reqs;
3169 for (const auto &p : mds_sessions) {
3170 const auto s = p.second;
3171 if (!s->unsafe_requests.empty()) {
3172 MetaRequest *req = s->unsafe_requests.back();
3173 req->get();
3174 last_unsafe_reqs.push_back(req);
3175 }
3176 }
3177
3178 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3179 p != last_unsafe_reqs.end();
3180 ++p) {
3181 MetaRequest *req = *p;
3182 if (req->unsafe_item.is_on_list())
3183 wait_on_list(req->waitfor_safe);
3184 put_request(req);
3185 }
3186 }
3187
3188 void Client::kick_requests_closed(MetaSession *session)
3189 {
3190 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3191 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3192 p != mds_requests.end(); ) {
3193 MetaRequest *req = p->second;
3194 ++p;
3195 if (req->mds == session->mds_num) {
3196 if (req->caller_cond) {
3197 req->kick = true;
3198 req->caller_cond->notify_all();
3199 }
3200 req->item.remove_myself();
3201 if (req->got_unsafe) {
3202 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3203 req->unsafe_item.remove_myself();
3204 if (is_dir_operation(req)) {
3205 Inode *dir = req->inode();
3206 ceph_assert(dir);
3207 dir->set_async_err(-CEPHFS_EIO);
3208 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3209 << dir->ino << " " << req->get_tid() << dendl;
3210 req->unsafe_dir_item.remove_myself();
3211 }
3212 if (req->target) {
3213 InodeRef &in = req->target;
3214 in->set_async_err(-CEPHFS_EIO);
3215 lderr(cct) << "kick_requests_closed drop req of inode : "
3216 << in->ino << " " << req->get_tid() << dendl;
3217 req->unsafe_target_item.remove_myself();
3218 }
3219 signal_cond_list(req->waitfor_safe);
3220 unregister_request(req);
3221 }
3222 }
3223 }
3224 ceph_assert(session->requests.empty());
3225 ceph_assert(session->unsafe_requests.empty());
3226 }
3227
3228
3229
3230
3231 /************
3232 * leases
3233 */
3234
3235 void Client::got_mds_push(MetaSession *s)
3236 {
3237 s->seq++;
3238 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3239 if (s->state == MetaSession::STATE_CLOSING) {
3240 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3241 }
3242 }
3243
3244 void Client::handle_lease(const MConstRef<MClientLease>& m)
3245 {
3246 ldout(cct, 10) << __func__ << " " << *m << dendl;
3247
3248 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3249 mds_rank_t mds = mds_rank_t(m->get_source().num());
3250
3251 std::scoped_lock cl(client_lock);
3252 auto session = _get_mds_session(mds, m->get_connection().get());
3253 if (!session) {
3254 return;
3255 }
3256
3257 got_mds_push(session.get());
3258
3259 ceph_seq_t seq = m->get_seq();
3260
3261 Inode *in;
3262 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3263 if (inode_map.count(vino) == 0) {
3264 ldout(cct, 10) << " don't have vino " << vino << dendl;
3265 goto revoke;
3266 }
3267 in = inode_map[vino];
3268
3269 if (m->get_mask() & CEPH_LEASE_VALID) {
3270 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3271 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3272 goto revoke;
3273 }
3274 Dentry *dn = in->dir->dentries[m->dname];
3275 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3276 dn->lease_mds = -1;
3277 }
3278
3279 revoke:
3280 {
3281 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3282 m->get_mask(), m->get_ino(),
3283 m->get_first(), m->get_last(), m->dname);
3284 m->get_connection()->send_message2(std::move(reply));
3285 }
3286 }
3287
3288 void Client::_put_inode(Inode *in, int n)
3289 {
3290 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3291
3292 int left = in->get_nref();
3293 ceph_assert(left >= n + 1);
3294 in->iput(n);
3295 left -= n;
3296 if (left == 1) { // the last one will be held by the inode_map
3297 // release any caps
3298 remove_all_caps(in);
3299
3300 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3301 bool unclean = objectcacher->release_set(&in->oset);
3302 ceph_assert(!unclean);
3303 inode_map.erase(in->vino());
3304 if (use_faked_inos())
3305 _release_faked_ino(in);
3306
3307 if (root == nullptr) {
3308 root_ancestor = 0;
3309 while (!root_parents.empty())
3310 root_parents.erase(root_parents.begin());
3311 }
3312
3313 in->iput();
3314 }
3315 }
3316
3317 void Client::delay_put_inodes(bool wakeup)
3318 {
3319 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3320
3321 std::map<Inode*,int> release;
3322 {
3323 std::scoped_lock dl(delay_i_lock);
3324 release.swap(delay_i_release);
3325 }
3326
3327 if (release.empty())
3328 return;
3329
3330 for (auto &[in, cnt] : release)
3331 _put_inode(in, cnt);
3332
3333 if (wakeup)
3334 mount_cond.notify_all();
3335 }
3336
3337 void Client::put_inode(Inode *in, int n)
3338 {
3339 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3340
3341 std::scoped_lock dl(delay_i_lock);
3342 delay_i_release[in] += n;
3343 }
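// Note that put_inode() only queues the decref (under delay_i_lock); the
// real _put_inode() runs later from delay_put_inodes(), which executes
// with client_lock held — presumably so the last reference is never
// dropped from a context where tearing down caps/cache state would be
// unsafe.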
3344
3345 void Client::close_dir(Dir *dir)
3346 {
3347 Inode *in = dir->parent_inode;
3348 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3349 ceph_assert(dir->is_empty());
3350 ceph_assert(in->dir == dir);
3351 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3352 if (!in->dentries.empty())
3353 in->get_first_parent()->put(); // unpin dentry
3354
3355 delete in->dir;
3356 in->dir = 0;
3357 put_inode(in); // unpin inode
3358 }
3359
3360 /**
3361 * Don't call this with in==NULL; use get_or_create for that.
3362 * Leave dn set to the default NULL unless you're trying to add
3363 * a new inode to a pre-created Dentry.
3364 */
3365 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3366 {
3367 if (!dn) {
3368 // create a new Dentry
3369 dn = new Dentry(dir, name);
3370
3371 lru.lru_insert_mid(dn); // mid or top?
3372
3373 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3374 << " dn " << dn << " (new dn)" << dendl;
3375 } else {
3376 ceph_assert(!dn->inode);
3377 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3378 << " dn " << dn << " (old dn)" << dendl;
3379 }
3380
3381 if (in) { // link to inode
3382 InodeRef tmp_ref;
3383 // only one parent for directories!
3384 if (in->is_dir() && !in->dentries.empty()) {
3385 tmp_ref = in; // prevent unlink below from freeing the inode.
3386 Dentry *olddn = in->get_first_parent();
3387 ceph_assert(olddn->dir != dir || olddn->name != name);
3388 Inode *old_diri = olddn->dir->parent_inode;
3389 clear_dir_complete_and_ordered(old_diri, true);
3390 unlink(olddn, true, true); // keep dir, dentry
3391 }
3392
3393 dn->link(in);
3394 inc_dentry_nr();
3395 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3396 }
3397
3398 return dn;
3399 }
3400
3401 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3402 {
3403 InodeRef in(dn->inode);
3404 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3405 << " inode " << dn->inode << dendl;
3406
3407 // unlink from inode
3408 if (dn->inode) {
3409 dn->unlink();
3410 dec_dentry_nr();
3411 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3412 }
3413
3414 if (keepdentry) {
3415 dn->lease_mds = -1;
3416 } else {
3417 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3418
3419 // unlink from dir
3420 Dir *dir = dn->dir;
3421 dn->detach();
3422
3423 // delete the dentry
3424 lru.lru_remove(dn);
3425 dn->put();
3426
3427 if (dir->is_empty() && !keepdir)
3428 close_dir(dir);
3429 }
3430 }
3431
3432 /**
3433 * For asynchronous flushes, check for errors from the IO and
3434 * update the inode if necessary
3435 */
3436 class C_Client_FlushComplete : public Context {
3437 private:
3438 Client *client;
3439 InodeRef inode;
3440 public:
3441 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3442 void finish(int r) override {
3443 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3444 if (r != 0) {
3445 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3446 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3447 << " 0x" << std::hex << inode->ino << std::dec
3448 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3449 inode->set_async_err(r);
3450 }
3451 }
3452 };
3453
3454
3455 /****
3456 * caps
3457 */
3458
3459 void Client::get_cap_ref(Inode *in, int cap)
3460 {
3461 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3462 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3463 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3464 in->iget();
3465 }
3466 if ((cap & CEPH_CAP_FILE_CACHE) &&
3467 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3468 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3469 in->iget();
3470 }
3471 in->get_cap_ref(cap);
3472 }
3473
3474 void Client::put_cap_ref(Inode *in, int cap)
3475 {
3476 int last = in->put_cap_ref(cap);
3477 if (last) {
3478 int put_nref = 0;
3479 int drop = last & ~in->caps_issued();
3480 if (in->snapid == CEPH_NOSNAP) {
3481 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3482 !in->cap_snaps.empty() &&
3483 in->cap_snaps.rbegin()->second.writing) {
3484 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3485 in->cap_snaps.rbegin()->second.writing = 0;
3486 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3487 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3488 }
3489 if (last & CEPH_CAP_FILE_BUFFER) {
3490 for (auto &p : in->cap_snaps)
3491 p.second.dirty_data = 0;
3492 signal_cond_list(in->waitfor_commit);
3493 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3494 ++put_nref;
3495 }
3496 }
3497 if (last & CEPH_CAP_FILE_CACHE) {
3498 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3499 ++put_nref;
3500 }
3501 if (drop)
3502 check_caps(in, 0);
3503 if (put_nref)
3504 put_inode(in, put_nref);
3505 }
3506 }
3507
3508 // get caps for a given file handle -- the inode should have @need caps
3509 // issued by the mds and @want caps not revoked (or not under revocation).
3510 // this routine blocks until the cap requirement is satisfied, and it also
3511 // accounts (tracks) a capability hit when the cap requirement succeeds.
3512 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3513 {
3514 Inode *in = fh->inode.get();
3515
3516 int r = check_pool_perm(in, need);
3517 if (r < 0)
3518 return r;
3519
3520 while (1) {
3521 int file_wanted = in->caps_file_wanted();
3522 if ((file_wanted & need) != need) {
3523 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3524 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3525 << dendl;
3526 return -CEPHFS_EBADF;
3527 }
3528
3529 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3530 return -CEPHFS_EBADF;
3531
3532 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3533 return -CEPHFS_EIO;
3534
3535 int implemented;
3536 int have = in->caps_issued(&implemented);
3537
3538 bool waitfor_caps = false;
3539 bool waitfor_commit = false;
3540
3541 if (have & need & CEPH_CAP_FILE_WR) {
3542 if (endoff > 0) {
3543 if ((endoff >= (loff_t)in->max_size ||
3544 endoff > (loff_t)(in->size << 1)) &&
3545 endoff > (loff_t)in->wanted_max_size) {
3546 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3547 in->wanted_max_size = endoff;
3548 }
3549 if (in->wanted_max_size > in->max_size &&
3550 in->wanted_max_size > in->requested_max_size)
3551 check_caps(in, 0);
3552 }
3553
3554 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3555 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3556 waitfor_caps = true;
3557 }
3558 if (!in->cap_snaps.empty()) {
3559 if (in->cap_snaps.rbegin()->second.writing) {
3560 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3561 waitfor_caps = true;
3562 }
3563 for (auto &p : in->cap_snaps) {
3564 if (p.second.dirty_data) {
3565 waitfor_commit = true;
3566 break;
3567 }
3568 }
3569 if (waitfor_commit) {
3570 _flush(in, new C_Client_FlushComplete(this, in));
3571 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3572 }
3573 }
3574 }
3575
3576 if (!waitfor_caps && !waitfor_commit) {
3577 if ((have & need) == need) {
3578 int revoking = implemented & ~have;
3579 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3580 << " need " << ccap_string(need) << " want " << ccap_string(want)
3581 << " revoking " << ccap_string(revoking)
3582 << dendl;
3583 if ((revoking & want) == 0) {
3584 *phave = need | (have & want);
3585 in->get_cap_ref(need);
3586 cap_hit();
3587 return 0;
3588 }
3589 }
3590 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3591 waitfor_caps = true;
3592 }
3593
3594 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3595 in->auth_cap->session->readonly)
3596 return -CEPHFS_EROFS;
3597
3598 if (in->flags & I_CAP_DROPPED) {
3599 int mds_wanted = in->caps_mds_wanted();
3600 if ((mds_wanted & need) != need) {
3601 int ret = _renew_caps(in);
3602 if (ret < 0)
3603 return ret;
3604 continue;
3605 }
3606 if (!(file_wanted & ~mds_wanted))
3607 in->flags &= ~I_CAP_DROPPED;
3608 }
3609
3610 if (waitfor_caps)
3611 wait_on_list(in->waitfor_caps);
3612 else if (waitfor_commit)
3613 wait_on_list(in->waitfor_commit);
3614 }
3615 }
3616
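// Editor's note -- illustrative usage sketch, not upstream code: callers of
// get_caps() pair it with put_cap_ref(). A buffered write path would look
// roughly like this, assuming client_lock is held, fh is an open handle,
// in = fh->inode.get(), and offset/size are the caller's locals:
//
//   int have;
//   int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
//                    &have, offset + size);   // may block on max_size
//   if (r < 0)
//     return r;
//   // ... do the write, buffering only if (have & CEPH_CAP_FILE_BUFFER) ...
//   put_cap_ref(in, CEPH_CAP_FILE_WR);        // drop the ref get_caps took
//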
3617 int Client::get_caps_used(Inode *in)
3618 {
3619 unsigned used = in->caps_used();
3620 if (!(used & CEPH_CAP_FILE_CACHE) &&
3621 !objectcacher->set_is_empty(&in->oset))
3622 used |= CEPH_CAP_FILE_CACHE;
3623 return used;
3624 }
3625
3626 void Client::cap_delay_requeue(Inode *in)
3627 {
3628 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3629
3630 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
3631 delayed_list.push_back(&in->delay_cap_item);
3632 }
3633
3634 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3635 int flags, int used, int want, int retain,
3636 int flush, ceph_tid_t flush_tid)
3637 {
3638 int held = cap->issued | cap->implemented;
3639 int revoking = cap->implemented & ~cap->issued;
3640 retain &= ~revoking;
3641 int dropping = cap->issued & ~retain;
3642 int op = CEPH_CAP_OP_UPDATE;
3643
3644 ldout(cct, 10) << __func__ << " " << *in
3645 << " mds." << session->mds_num << " seq " << cap->seq
3646 << " used " << ccap_string(used)
3647 << " want " << ccap_string(want)
3648 << " flush " << ccap_string(flush)
3649 << " retain " << ccap_string(retain)
3650 << " held "<< ccap_string(held)
3651 << " revoking " << ccap_string(revoking)
3652 << " dropping " << ccap_string(dropping)
3653 << dendl;
3654
3655 if (cct->_conf->client_inject_release_failure && revoking) {
3656 const int would_have_issued = cap->issued & retain;
3657 const int would_have_implemented = cap->implemented & (cap->issued | used);
3658 // Simulated bug:
3659 // - tell the server we think issued is whatever they issued plus whatever we implemented
3660 // - leave what we have implemented in place
3661 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3662 cap->issued = cap->issued | cap->implemented;
3663
3664 // Make an exception for revoking xattr caps: we are injecting
3665 // failure to release other caps, but allow xattr because the client
3666 // will block on xattr ops if it can't release these to the MDS (#9800)
3667 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3668 cap->issued ^= xattr_mask & revoking;
3669 cap->implemented ^= xattr_mask & revoking;
3670
3671 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3672 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3673 } else {
3674 // Normal behaviour
3675 cap->issued &= retain;
3676 cap->implemented &= cap->issued | used;
3677 }
3678
3679 snapid_t follows = 0;
3680
3681 if (flush)
3682 follows = in->snaprealm->get_snap_context().seq;
3683
3684 auto m = make_message<MClientCaps>(op,
3685 in->ino,
3686 0,
3687 cap->cap_id, cap->seq,
3688 cap->implemented,
3689 want,
3690 flush,
3691 cap->mseq,
3692 cap_epoch_barrier);
3693 m->caller_uid = in->cap_dirtier_uid;
3694 m->caller_gid = in->cap_dirtier_gid;
3695
3696 m->head.issue_seq = cap->issue_seq;
3697 m->set_tid(flush_tid);
3698
3699 m->head.uid = in->uid;
3700 m->head.gid = in->gid;
3701 m->head.mode = in->mode;
3702
3703 m->head.nlink = in->nlink;
3704
3705 if (flush & CEPH_CAP_XATTR_EXCL) {
3706 encode(in->xattrs, m->xattrbl);
3707 m->head.xattr_version = in->xattr_version;
3708 }
3709
3710 m->size = in->size;
3711 m->max_size = in->max_size;
3712 m->truncate_seq = in->truncate_seq;
3713 m->truncate_size = in->truncate_size;
3714 m->mtime = in->mtime;
3715 m->atime = in->atime;
3716 m->ctime = in->ctime;
3717 m->btime = in->btime;
3718 m->time_warp_seq = in->time_warp_seq;
3719 m->change_attr = in->change_attr;
3720
3721 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3722 !in->cap_snaps.empty() &&
3723 in->cap_snaps.rbegin()->second.flush_tid == 0)
3724 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3725 m->flags = flags;
3726
3727 if (flush & CEPH_CAP_FILE_WR) {
3728 m->inline_version = in->inline_version;
3729 m->inline_data = in->inline_data;
3730 }
3731
3732 in->reported_size = in->size;
3733 m->set_snap_follows(follows);
3734 cap->wanted = want;
3735 if (cap == in->auth_cap) {
3736 if (want & CEPH_CAP_ANY_FILE_WR) {
3737 m->set_max_size(in->wanted_max_size);
3738 in->requested_max_size = in->wanted_max_size;
3739 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3740 } else {
3741 in->requested_max_size = 0;
3742 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3743 }
3744 }
3745
3746 if (!session->flushing_caps_tids.empty())
3747 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3748
3749 session->con->send_message2(std::move(m));
3750 }
3751
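// Worked example of the bit arithmetic at the top of send_cap() (values
// chosen for illustration): if issued = Fc|Fr, implemented = Fc|Fr|Fb and
// the caller passes retain = Fr, then
//   revoking = implemented & ~issued  = Fb   (the MDS wants Fb back)
//   retain  &= ~revoking              = Fr   (never retain revoked caps)
//   dropping = issued & ~retain       = Fc   (released by this message)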
3752 static bool is_max_size_approaching(Inode *in)
3753 {
3754 /* mds will adjust max size according to the reported size */
3755 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3756 return false;
3757 if (in->size >= in->max_size)
3758 return true;
3759 /* half of previous max_size increment has been used */
3760 if (in->max_size > in->reported_size &&
3761 (in->size << 1) >= in->max_size + in->reported_size)
3762 return true;
3763 return false;
3764 }
3765
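// Worked example of the "half the increment used" test above: with
// reported_size = 4 MB and max_size = 8 MB, the test fires once size
// reaches 6 MB, since (6 << 1) = 12 >= 8 + 4 -- i.e. we ask the MDS for
// more room before actually hitting the limit.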
3766 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3767 {
3768 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3769 return used;
3770 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3771 return used;
3772
3773 if (issued & CEPH_CAP_FILE_LAZYIO) {
3774 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3775 used &= ~CEPH_CAP_FILE_CACHE;
3776 used |= CEPH_CAP_FILE_LAZYIO;
3777 }
3778 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3779 used &= ~CEPH_CAP_FILE_BUFFER;
3780 used |= CEPH_CAP_FILE_LAZYIO;
3781 }
3782 } else {
3783 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3784 used &= ~CEPH_CAP_FILE_CACHE;
3785 used |= CEPH_CAP_FILE_LAZYIO;
3786 }
3787 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3788 used &= ~CEPH_CAP_FILE_BUFFER;
3789 used |= CEPH_CAP_FILE_LAZYIO;
3790 }
3791 }
3792 return used;
3793 }
3794
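// Illustrative example: a LAZYIO client may have used = Fc while the MDS
// has issued only Fl (Fc was revoked but is still implemented). The remap
// above reports used = Fl instead, so check_caps()/send_cap() never claim
// use of a cap bit the MDS no longer thinks this client holds.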
3795 /**
3796 * check_caps
3797 *
3798 * Examine currently used and wanted versus held caps. Release, flush or ack
3799 * revoked caps to the MDS as appropriate.
3800 *
3801 * @param in the inode to check
3802 * @param flags flags to apply to cap check
3803 */
3804 void Client::check_caps(Inode *in, unsigned flags)
3805 {
3806 unsigned wanted = in->caps_wanted();
3807 unsigned used = get_caps_used(in);
3808 unsigned cap_used;
3809
3810 int implemented;
3811 int issued = in->caps_issued(&implemented);
3812 int revoking = implemented & ~issued;
3813
3814 int orig_used = used;
3815 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3816
3817 int retain = wanted | used | CEPH_CAP_PIN;
3818 if (!is_unmounting() && in->nlink > 0) {
3819 if (wanted) {
3820 retain |= CEPH_CAP_ANY;
3821 } else if (in->is_dir() &&
3822 (issued & CEPH_CAP_FILE_SHARED) &&
3823 (in->flags & I_COMPLETE)) {
3824 // we do this here because we don't want to drop to Fs (and then
3825 // drop the Fs if we do a create!) if that alone makes us send lookups
3826 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere.
3827 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3828 retain |= wanted;
3829 } else {
3830 retain |= CEPH_CAP_ANY_SHARED;
3831 // keep RD only if we didn't have the file open RW,
3832 // because then the mds would revoke it anyway to
3833 // journal max_size=0.
3834 if (in->max_size == 0)
3835 retain |= CEPH_CAP_ANY_RD;
3836 }
3837 }
3838
3839 ldout(cct, 10) << __func__ << " on " << *in
3840 << " wanted " << ccap_string(wanted)
3841 << " used " << ccap_string(used)
3842 << " issued " << ccap_string(issued)
3843 << " revoking " << ccap_string(revoking)
3844 << " flags=" << flags
3845 << dendl;
3846
3847 if (in->snapid != CEPH_NOSNAP)
3848 return; //snap caps last forever, can't write
3849
3850 if (in->caps.empty())
3851 return; // guard if at end of func
3852
3853 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3854 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3855 if (_release(in))
3856 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3857 }
3858
3859 for (auto &[mds, cap] : in->caps) {
3860 auto session = mds_sessions.at(mds);
3861
3862 cap_used = used;
3863 if (in->auth_cap && &cap != in->auth_cap)
3864 cap_used &= ~in->auth_cap->issued;
3865
3866 revoking = cap.implemented & ~cap.issued;
3867
3868 ldout(cct, 10) << " cap mds." << mds
3869 << " issued " << ccap_string(cap.issued)
3870 << " implemented " << ccap_string(cap.implemented)
3871 << " revoking " << ccap_string(revoking) << dendl;
3872
3873 if (in->wanted_max_size > in->max_size &&
3874 in->wanted_max_size > in->requested_max_size &&
3875 &cap == in->auth_cap)
3876 goto ack;
3877
3878 /* approaching file_max? */
3879 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3880 &cap == in->auth_cap &&
3881 is_max_size_approaching(in)) {
3882 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3883 << ", reported " << in->reported_size << dendl;
3884 goto ack;
3885 }
3886
3887 /* completed revocation? */
3888 if (revoking && (revoking & cap_used) == 0) {
3889 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3890 goto ack;
3891 }
3892
3893 /* want more caps from mds? */
3894 if (wanted & ~(cap.wanted | cap.issued))
3895 goto ack;
3896
3897 if (!revoking && is_unmounting() && (cap_used == 0))
3898 goto ack;
3899
3900 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3901 !in->dirty_caps) // and we have no dirty caps
3902 continue;
3903
3904 if (!(flags & CHECK_CAPS_NODELAY)) {
3905 ldout(cct, 10) << "delaying cap release" << dendl;
3906 cap_delay_requeue(in);
3907 continue;
3908 }
3909
3910 ack:
3911 if (&cap == in->auth_cap) {
3912 if (in->flags & I_KICK_FLUSH) {
3913 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3914 << " to mds." << mds << dendl;
3915 kick_flushing_caps(in, session.get());
3916 }
3917 if (!in->cap_snaps.empty() &&
3918 in->cap_snaps.rbegin()->second.flush_tid == 0)
3919 flush_snaps(in);
3920 }
3921
3922 int flushing;
3923 int msg_flags = 0;
3924 ceph_tid_t flush_tid;
3925 if (in->auth_cap == &cap && in->dirty_caps) {
3926 flushing = mark_caps_flushing(in, &flush_tid);
3927 if (flags & CHECK_CAPS_SYNCHRONOUS)
3928 msg_flags |= MClientCaps::FLAG_SYNC;
3929 } else {
3930 flushing = 0;
3931 flush_tid = 0;
3932 }
3933
3934 in->delay_cap_item.remove_myself();
3935 send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
3936 flushing, flush_tid);
3937 }
3938 }
3939
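// Condensed decision sketch for the per-cap loop above (editor's summary):
//   for each (mds, cap) on the inode:
//     wanted_max_size grew (auth cap)            -> ack: send update
//     size approaching max_size (auth cap, Fw)   -> ack: send update
//     a revocation has fully completed           -> ack: confirm to MDS
//     we want caps the MDS doesn't know about    -> ack: request them
//     unmounting and cap now unused              -> ack: release
//     nothing unwanted held and no dirty caps    -> skip this cap
//     CHECK_CAPS_NODELAY not set                 -> requeue with delay, skip
//     otherwise                                  -> fall through to ack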
3940
3941 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3942 {
3943 int used = get_caps_used(in);
3944 int dirty = in->caps_dirty();
3945 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3946
3947 if (in->cap_snaps.size() &&
3948 in->cap_snaps.rbegin()->second.writing) {
3949 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3950 return;
3951 } else if (in->caps_dirty() ||
3952 (used & CEPH_CAP_FILE_WR) ||
3953 (dirty & CEPH_CAP_ANY_WR)) {
3954 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3955 ceph_assert(capsnapem.second); /* element inserted */
3956 CapSnap &capsnap = capsnapem.first->second;
3957 capsnap.context = old_snapc;
3958 capsnap.issued = in->caps_issued();
3959 capsnap.dirty = in->caps_dirty();
3960
3961 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3962
3963 capsnap.uid = in->uid;
3964 capsnap.gid = in->gid;
3965 capsnap.mode = in->mode;
3966 capsnap.btime = in->btime;
3967 capsnap.xattrs = in->xattrs;
3968 capsnap.xattr_version = in->xattr_version;
3969 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3970 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3971
3972 if (used & CEPH_CAP_FILE_WR) {
3973 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3974 capsnap.writing = 1;
3975 } else {
3976 finish_cap_snap(in, capsnap, used);
3977 }
3978 } else {
3979 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3980 }
3981 }
3982
3983 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3984 {
3985 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3986 capsnap.size = in->size;
3987 capsnap.mtime = in->mtime;
3988 capsnap.atime = in->atime;
3989 capsnap.ctime = in->ctime;
3990 capsnap.time_warp_seq = in->time_warp_seq;
3991 capsnap.change_attr = in->change_attr;
3992 capsnap.dirty |= in->caps_dirty();
3993
3994 /* Only reset it if it wasn't set before */
3995 if (capsnap.cap_dirtier_uid == -1) {
3996 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3997 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3998 }
3999
4000 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4001 capsnap.inline_data = in->inline_data;
4002 capsnap.inline_version = in->inline_version;
4003 }
4004
4005 if (used & CEPH_CAP_FILE_BUFFER) {
4006 capsnap.writing = 1;
4007 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
4008 << " WRBUFFER, delaying" << dendl;
4009 } else {
4010 capsnap.dirty_data = 0;
4011 flush_snaps(in);
4012 }
4013 }
4014
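// Capsnap lifecycle sketch (condensed from the two functions above):
//   queue_cap_snap(): a snap taken while the inode is dirty creates a
//     CapSnap; if Fw is in use, capsnap.writing is set and we wait for
//     the write path, otherwise we fall through to finish_cap_snap().
//   finish_cap_snap(): capture size/times/change_attr; if Fb is still in
//     use the buffered data must be written back first (flush_snaps() is
//     deferred until objectcacher writeback completes), otherwise
//     flush_snaps() sends CEPH_CAP_OP_FLUSHSNAP to the auth MDS right away.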
4015 void Client::send_flush_snap(Inode *in, MetaSession *session,
4016 snapid_t follows, CapSnap& capsnap)
4017 {
4018 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4019 in->ino, in->snaprealm->ino, 0,
4020 in->auth_cap->mseq, cap_epoch_barrier);
4021 m->caller_uid = capsnap.cap_dirtier_uid;
4022 m->caller_gid = capsnap.cap_dirtier_gid;
4023
4024 m->set_client_tid(capsnap.flush_tid);
4025 m->head.snap_follows = follows;
4026
4027 m->head.caps = capsnap.issued;
4028 m->head.dirty = capsnap.dirty;
4029
4030 m->head.uid = capsnap.uid;
4031 m->head.gid = capsnap.gid;
4032 m->head.mode = capsnap.mode;
4033 m->btime = capsnap.btime;
4034
4035 m->size = capsnap.size;
4036
4037 m->head.xattr_version = capsnap.xattr_version;
4038 encode(capsnap.xattrs, m->xattrbl);
4039
4040 m->ctime = capsnap.ctime;
4041 m->btime = capsnap.btime;
4042 m->mtime = capsnap.mtime;
4043 m->atime = capsnap.atime;
4044 m->time_warp_seq = capsnap.time_warp_seq;
4045 m->change_attr = capsnap.change_attr;
4046
4047 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4048 m->inline_version = in->inline_version;
4049 m->inline_data = in->inline_data;
4050 }
4051
4052 ceph_assert(!session->flushing_caps_tids.empty());
4053 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4054
4055 session->con->send_message2(std::move(m));
4056 }
4057
4058 void Client::flush_snaps(Inode *in)
4059 {
4060 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
4061 ceph_assert(in->cap_snaps.size());
4062
4063 // pick auth mds
4064 ceph_assert(in->auth_cap);
4065 MetaSession *session = in->auth_cap->session;
4066
4067 for (auto &p : in->cap_snaps) {
4068 CapSnap &capsnap = p.second;
4069 // only flush capsnaps that haven't been flushed yet
4070 if (capsnap.flush_tid > 0)
4071 continue;
4072
4073 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
4074 << " follows " << p.first
4075 << " size " << capsnap.size
4076 << " mtime " << capsnap.mtime
4077 << " dirty_data=" << capsnap.dirty_data
4078 << " writing=" << capsnap.writing
4079 << " on " << *in << dendl;
4080 if (capsnap.dirty_data || capsnap.writing)
4081 break;
4082
4083 capsnap.flush_tid = ++last_flush_tid;
4084 session->flushing_caps_tids.insert(capsnap.flush_tid);
4085 in->flushing_cap_tids[capsnap.flush_tid] = 0;
4086 if (!in->flushing_cap_item.is_on_list())
4087 session->flushing_caps.push_back(&in->flushing_cap_item);
4088
4089 send_flush_snap(in, session, p.first, capsnap);
4090 }
4091 }
4092
4093 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
4094 {
4095 ceph::condition_variable cond;
4096 ls.push_back(&cond);
4097 std::unique_lock l{client_lock, std::adopt_lock};
4098 cond.wait(l);
4099 l.release();
4100 ls.remove(&cond);
4101 }
4102
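// Editor's note on the locking pattern above (also used in
// wait_on_context_list() below): client_lock is already held by the
// caller, so the unique_lock adopts it instead of re-locking, and
// l.release() hands ownership back without unlocking -- only the
// cond.wait() call itself temporarily drops the mutex.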
4103 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4104 {
4105 for (auto cond : ls) {
4106 cond->notify_all();
4107 }
4108 }
4109
4110 void Client::wait_on_context_list(list<Context*>& ls)
4111 {
4112 ceph::condition_variable cond;
4113 bool done = false;
4114 int r;
4115 ls.push_back(new C_Cond(cond, &done, &r));
4116 std::unique_lock l{client_lock, std::adopt_lock};
4117 cond.wait(l, [&done] { return done;});
4118 l.release();
4119 }
4120
4121 void Client::signal_context_list(list<Context*>& ls)
4122 {
4123 while (!ls.empty()) {
4124 ls.front()->complete(0);
4125 ls.pop_front();
4126 }
4127 }
4128
4129 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4130 {
4131 for (const auto &cap : s->caps) {
4132 auto &in = cap->inode;
4133 if (reconnect) {
4134 in.requested_max_size = 0;
4135 in.wanted_max_size = 0;
4136 } else {
4137 if (cap->gen < s->cap_gen) {
4138 // mds did not re-issue stale cap.
4139 cap->issued = cap->implemented = CEPH_CAP_PIN;
4140 // make sure mds knows what we want.
4141 if (in.caps_file_wanted() & ~cap->wanted)
4142 in.flags |= I_CAP_DROPPED;
4143 }
4144 }
4145 signal_cond_list(in.waitfor_caps);
4146 }
4147 }
4148
4149
4150 // flush dirty data (from objectcache)
4151
4152 class C_Client_CacheInvalidate : public Context {
4153 private:
4154 Client *client;
4155 vinodeno_t ino;
4156 int64_t offset, length;
4157 public:
4158 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4159 client(c), offset(off), length(len) {
4160 if (client->use_faked_inos())
4161 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4162 else
4163 ino = in->vino();
4164 }
4165 void finish(int r) override {
4166 // _async_invalidate takes the lock when it needs to; call this back from outside the lock.
4167 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4168 client->_async_invalidate(ino, offset, length);
4169 }
4170 };
4171
4172 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4173 {
4174 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4175 if (!mref_reader.is_state_satisfied())
4176 return;
4177
4178 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4179 ino_invalidate_cb(callback_handle, ino, off, len);
4180 }
4181
4182 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4183
4184 if (ino_invalidate_cb)
4185 // we queue the invalidate, which calls the callback and decrements the ref
4186 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4187 }
4188
4189 void Client::_invalidate_inode_cache(Inode *in)
4190 {
4191 ldout(cct, 10) << __func__ << " " << *in << dendl;
4192
4193 // invalidate our userspace inode cache
4194 if (cct->_conf->client_oc) {
4195 objectcacher->release_set(&in->oset);
4196 if (!objectcacher->set_is_empty(&in->oset))
4197 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4198 }
4199
4200 _schedule_invalidate_callback(in, 0, 0);
4201 }
4202
4203 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4204 {
4205 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4206
4207 // invalidate our userspace inode cache
4208 if (cct->_conf->client_oc) {
4209 vector<ObjectExtent> ls;
4210 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4211 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4212 }
4213
4214 _schedule_invalidate_callback(in, off, len);
4215 }
4216
4217 bool Client::_release(Inode *in)
4218 {
4219 ldout(cct, 20) << "_release " << *in << dendl;
4220 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4221 _invalidate_inode_cache(in);
4222 return true;
4223 }
4224 return false;
4225 }
4226
4227 bool Client::_flush(Inode *in, Context *onfinish)
4228 {
4229 ldout(cct, 10) << "_flush " << *in << dendl;
4230
4231 if (!in->oset.dirty_or_tx) {
4232 ldout(cct, 10) << " nothing to flush" << dendl;
4233 onfinish->complete(0);
4234 return true;
4235 }
4236
4237 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4238 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4239 objectcacher->purge_set(&in->oset);
4240 if (onfinish) {
4241 onfinish->complete(-CEPHFS_ENOSPC);
4242 }
4243 return true;
4244 }
4245
4246 return objectcacher->flush_set(&in->oset, onfinish);
4247 }
4248
4249 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4250 {
4251 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4252 if (!in->oset.dirty_or_tx) {
4253 ldout(cct, 10) << " nothing to flush" << dendl;
4254 return;
4255 }
4256
4257 C_SaferCond onflush("Client::_flush_range flock");
4258 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4259 offset, size, &onflush);
4260 if (!ret) {
4261 // wait for flush
4262 client_lock.unlock();
4263 onflush.wait();
4264 client_lock.lock();
4265 }
4266 }
4267
4268 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4269 {
4270 // std::scoped_lock l(client_lock);
4271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4272 Inode *in = static_cast<Inode *>(oset->parent);
4273 ceph_assert(in);
4274 _flushed(in);
4275 }
4276
4277 void Client::_flushed(Inode *in)
4278 {
4279 ldout(cct, 10) << "_flushed " << *in << dendl;
4280
4281 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4282 }
4283
4284
4285
4286 // checks common to add_update_cap, handle_cap_grant
4287 void Client::check_cap_issue(Inode *in, unsigned issued)
4288 {
4289 unsigned had = in->caps_issued();
4290
4291 if ((issued & CEPH_CAP_FILE_CACHE) &&
4292 !(had & CEPH_CAP_FILE_CACHE))
4293 in->cache_gen++;
4294
4295 if ((issued & CEPH_CAP_FILE_SHARED) !=
4296 (had & CEPH_CAP_FILE_SHARED)) {
4297 if (issued & CEPH_CAP_FILE_SHARED)
4298 in->shared_gen++;
4299 if (in->is_dir())
4300 clear_dir_complete_and_ordered(in, true);
4301 }
4302 }
4303
4304 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4305 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4306 inodeno_t realm, int flags, const UserPerm& cap_perms)
4307 {
4308 if (!in->is_any_caps()) {
4309 ceph_assert(in->snaprealm == 0);
4310 in->snaprealm = get_snap_realm(realm);
4311 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4312 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4313 } else {
4314 ceph_assert(in->snaprealm);
4315 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4316 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4317 in->snaprealm_item.remove_myself();
4318 auto oldrealm = in->snaprealm;
4319 in->snaprealm = get_snap_realm(realm);
4320 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4321 put_snap_realm(oldrealm);
4322 }
4323 }
4324
4325 mds_rank_t mds = mds_session->mds_num;
4326 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4327 Cap &cap = capem.first->second;
4328 if (!capem.second) {
4329 if (cap.gen < mds_session->cap_gen)
4330 cap.issued = cap.implemented = CEPH_CAP_PIN;
4331
4332 /*
4333 * auth mds of the inode changed. we received the cap export
4334 * message, but still haven't received the cap import message.
4335 * handle_cap_export() updated the new auth MDS' cap.
4336 *
4337 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4338 * a message that was sent before the cap import message. So
4339 * don't remove caps.
4340 */
4341 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4342 if (&cap != in->auth_cap)
4343 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4344
4345 ceph_assert(cap.cap_id == cap_id);
4346 seq = cap.seq;
4347 mseq = cap.mseq;
4348 issued |= cap.issued;
4349 flags |= CEPH_CAP_FLAG_AUTH;
4350 }
4351 } else {
4352 inc_pinned_icaps();
4353 }
4354
4355 check_cap_issue(in, issued);
4356
4357 if (flags & CEPH_CAP_FLAG_AUTH) {
4358 if (in->auth_cap != &cap &&
4359 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4360 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4361 ldout(cct, 10) << __func__ << " changing auth cap: "
4362 << "add myself to new auth MDS' flushing caps list" << dendl;
4363 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4364 }
4365 in->auth_cap = &cap;
4366 }
4367 }
4368
4369 unsigned old_caps = cap.issued;
4370 cap.cap_id = cap_id;
4371 cap.issued = issued;
4372 cap.implemented |= issued;
4373 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4374 cap.wanted = wanted;
4375 else
4376 cap.wanted |= wanted;
4377 cap.seq = seq;
4378 cap.issue_seq = seq;
4379 cap.mseq = mseq;
4380 cap.gen = mds_session->cap_gen;
4381 cap.latest_perms = cap_perms;
4382 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4383 << " from mds." << mds
4384 << " on " << *in
4385 << dendl;
4386
4387 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4388 // non-auth MDS is revoking the newly grant caps ?
4389 for (auto &p : in->caps) {
4390 if (&p.second == &cap)
4391 continue;
4392 if (p.second.implemented & ~p.second.issued & issued) {
4393 check_caps(in, CHECK_CAPS_NODELAY);
4394 break;
4395 }
4396 }
4397 }
4398
4399 if (issued & ~old_caps)
4400 signal_cond_list(in->waitfor_caps);
4401 }
4402
4403 void Client::remove_cap(Cap *cap, bool queue_release)
4404 {
4405 auto &in = cap->inode;
4406 MetaSession *session = cap->session;
4407 mds_rank_t mds = cap->session->mds_num;
4408
4409 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4410
4411 if (queue_release) {
4412 session->enqueue_cap_release(
4413 in.ino,
4414 cap->cap_id,
4415 cap->issue_seq,
4416 cap->mseq,
4417 cap_epoch_barrier);
4418 } else {
4419 dec_pinned_icaps();
4420 }
4421
4422
4423 if (in.auth_cap == cap) {
4424 if (in.flushing_cap_item.is_on_list()) {
4425 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4426 in.flushing_cap_item.remove_myself();
4427 }
4428 in.auth_cap = NULL;
4429 }
4430 size_t n = in.caps.erase(mds);
4431 ceph_assert(n == 1);
4432 cap = nullptr;
4433
4434 if (!in.is_any_caps()) {
4435 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4436 in.snaprealm_item.remove_myself();
4437 put_snap_realm(in.snaprealm);
4438 in.snaprealm = 0;
4439 }
4440 }
4441
4442 void Client::remove_all_caps(Inode *in)
4443 {
4444 while (!in->caps.empty())
4445 remove_cap(&in->caps.begin()->second, true);
4446 }
4447
4448 void Client::remove_session_caps(MetaSession *s, int err)
4449 {
4450 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4451
4452 while (s->caps.size()) {
4453 Cap *cap = *s->caps.begin();
4454 InodeRef in(&cap->inode);
4455 bool dirty_caps = false;
4456 if (in->auth_cap == cap) {
4457 dirty_caps = in->dirty_caps | in->flushing_caps;
4458 in->wanted_max_size = 0;
4459 in->requested_max_size = 0;
4460 if (in->has_any_filelocks())
4461 in->flags |= I_ERROR_FILELOCK;
4462 }
4463 auto caps = cap->implemented;
4464 if (cap->wanted | cap->issued)
4465 in->flags |= I_CAP_DROPPED;
4466 remove_cap(cap, false);
4467 in->cap_snaps.clear();
4468 if (dirty_caps) {
4469 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4470 if (in->flushing_caps) {
4471 num_flushing_caps--;
4472 in->flushing_cap_tids.clear();
4473 }
4474 in->flushing_caps = 0;
4475 in->mark_caps_clean();
4476 put_inode(in.get());
4477 }
4478 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4479 if (caps && !in->caps_issued_mask(caps, true)) {
4480 if (err == -CEPHFS_EBLOCKLISTED) {
4481 if (in->oset.dirty_or_tx) {
4482 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4483 in->set_async_err(err);
4484 }
4485 objectcacher->purge_set(&in->oset);
4486 } else {
4487 objectcacher->release_set(&in->oset);
4488 }
4489 _schedule_invalidate_callback(in.get(), 0, 0);
4490 }
4491
4492 signal_cond_list(in->waitfor_caps);
4493 }
4494 s->flushing_caps_tids.clear();
4495 sync_cond.notify_all();
4496 }
4497
4498 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4499 {
4500 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4501 bool abort_on_failure = false;
4502
4503 errno = 0;
4504 int r = remount_cb(callback_handle);
4505 if (r == 0) {
4506 retries_on_invalidate = 0;
4507 } else {
4508 int e = errno;
4509 client_t whoami = get_nodeid();
4510 if (r == -1) {
4511 lderr(cct) <<
4512 "failed to remount (to trim kernel dentries): "
4513 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4514 } else {
4515 lderr(cct) <<
4516 "failed to remount (to trim kernel dentries): "
4517 "return code = " << r << dendl;
4518 }
4519 bool should_abort =
4520 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4521 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4522 !(retry_on_error && (++retries_on_invalidate < max_retries));
4523 if (should_abort && !is_unmounting()) {
4524 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4525 abort_on_failure = true;
4526 }
4527 }
4528 return std::make_pair(r, abort_on_failure);
4529 }
4530
4531 class C_Client_Remount : public Context {
4532 private:
4533 Client *client;
4534 public:
4535 explicit C_Client_Remount(Client *c) : client(c) {}
4536 void finish(int r) override {
4537 ceph_assert(r == 0);
4538 client->_do_remount(true);
4539 }
4540 };
4541
4542 void Client::_invalidate_kernel_dcache()
4543 {
4544 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4545 if (!mref_reader.is_state_satisfied())
4546 return;
4547
4548 if (can_invalidate_dentries) {
4549 if (dentry_invalidate_cb && root->dir) {
4550 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4551 p != root->dir->dentries.end();
4552 ++p) {
4553 if (p->second->inode)
4554 _schedule_invalidate_dentry_callback(p->second, false);
4555 }
4556 }
4557 } else if (remount_cb) {
4558 // Hacky:
4559 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4560 remount_finisher.queue(new C_Client_Remount(this));
4561 }
4562 }
4563
4564 void Client::_trim_negative_child_dentries(InodeRef& in)
4565 {
4566 if (!in->is_dir())
4567 return;
4568
4569 Dir* dir = in->dir;
4570 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4571 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4572 Dentry *dn = p->second;
4573 ++p;
4574 ceph_assert(!dn->inode);
4575 if (dn->lru_is_expireable())
4576 unlink(dn, true, false); // keep dir, drop dentry
4577 }
4578 if (dir->dentries.empty()) {
4579 close_dir(dir);
4580 }
4581 }
4582
4583 if (in->flags & I_SNAPDIR_OPEN) {
4584 InodeRef snapdir = open_snapdir(in.get());
4585 _trim_negative_child_dentries(snapdir);
4586 }
4587 }
4588
4589 class C_Client_CacheRelease : public Context {
4590 private:
4591 Client *client;
4592 vinodeno_t ino;
4593 public:
4594 C_Client_CacheRelease(Client *c, Inode *in) :
4595 client(c) {
4596 if (client->use_faked_inos())
4597 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4598 else
4599 ino = in->vino();
4600 }
4601 void finish(int r) override {
4602 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4603 client->_async_inode_release(ino);
4604 }
4605 };
4606
4607 void Client::_async_inode_release(vinodeno_t ino)
4608 {
4609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4610 if (!mref_reader.is_state_satisfied())
4611 return;
4612
4613 ldout(cct, 10) << __func__ << " " << ino << dendl;
4614 ino_release_cb(callback_handle, ino);
4615 }
4616
4617 void Client::_schedule_ino_release_callback(Inode *in) {
4618
4619 if (ino_release_cb)
4620 // we queue the release, which calls the callback and decrements the ref
4621 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4622 }
4623
4624 void Client::trim_caps(MetaSession *s, uint64_t max)
4625 {
4626 mds_rank_t mds = s->mds_num;
4627 size_t caps_size = s->caps.size();
4628 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4629 << " caps " << caps_size << dendl;
4630
4631 uint64_t trimmed = 0;
4632 auto p = s->caps.begin();
4633 std::set<Dentry *> to_trim; /* this prevents caps other than the one we're
4634 * looking at from being deleted during traversal. */
4635 while ((caps_size - trimmed) > max && !p.end()) {
4636 Cap *cap = *p;
4637 InodeRef in(&cap->inode);
4638
4639 // Increment p early because it will be invalidated if cap
4640 // is deleted inside remove_cap
4641 ++p;
4642
4643 if (in->caps.size() > 1 && cap != in->auth_cap) {
4644 int mine = cap->issued | cap->implemented;
4645 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4646 // disposable non-auth cap
4647 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4648 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4649 cap = (remove_cap(cap, true), nullptr);
4650 trimmed++;
4651 }
4652 } else {
4653 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4654 _trim_negative_child_dentries(in);
4655 bool all = true;
4656 auto q = in->dentries.begin();
4657 while (q != in->dentries.end()) {
4658 Dentry *dn = *q;
4659 ++q;
4660 if (dn->lru_is_expireable()) {
4661 if (can_invalidate_dentries &&
4662 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
4663 // Only issue one of these per DN for inodes in root: handle
4664 // others more efficiently by calling for root-child DNs at
4665 // the end of this function.
4666 _schedule_invalidate_dentry_callback(dn, true);
4667 }
4668 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4669 to_trim.insert(dn);
4670 } else {
4671 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4672 all = false;
4673 }
4674 }
4675 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
4676 _schedule_ino_release_callback(in.get());
4677 }
4678 if (all && in->ino != CEPH_INO_ROOT) {
4679 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4680 trimmed++;
4681 }
4682 }
4683 }
4684 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4685 for (const auto &dn : to_trim) {
4686 trim_dentry(dn);
4687 }
4688 to_trim.clear();
4689
4690 caps_size = s->caps.size();
4691 if (caps_size > (size_t)max)
4692 _invalidate_kernel_dcache();
4693 }
4694
4695 void Client::force_session_readonly(MetaSession *s)
4696 {
4697 s->readonly = true;
4698 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4699 auto &in = (*p)->inode;
4700 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4701 signal_cond_list(in.waitfor_caps);
4702 }
4703 }
4704
4705 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4706 {
4707 MetaSession *session = in->auth_cap->session;
4708
4709 int flushing = in->dirty_caps;
4710 ceph_assert(flushing);
4711
4712 ceph_tid_t flush_tid = ++last_flush_tid;
4713 in->flushing_cap_tids[flush_tid] = flushing;
4714
4715 if (!in->flushing_caps) {
4716 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4717 num_flushing_caps++;
4718 } else {
4719 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4720 }
4721
4722 in->flushing_caps |= flushing;
4723 in->mark_caps_clean();
4724
4725 if (!in->flushing_cap_item.is_on_list())
4726 session->flushing_caps.push_back(&in->flushing_cap_item);
4727 session->flushing_caps_tids.insert(flush_tid);
4728
4729 *ptid = flush_tid;
4730 return flushing;
4731 }
4732
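// Accounting sketch (editor's note): mark_caps_flushing() moves the
// inode's dirty_caps into flushing_caps under a fresh flush_tid, records
// the tid both per-inode (flushing_cap_tids) and per-session
// (flushing_caps_tids), and links the inode on the session's
// flushing_caps list; handle_cap_flush_ack() later unwinds all three.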
4733 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4734 {
4735 for (auto &p : in->cap_snaps) {
4736 CapSnap &capsnap = p.second;
4737 if (capsnap.flush_tid > 0) {
4738 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4739 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4740 }
4741 }
4742 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4743 it != in->flushing_cap_tids.end();
4744 ++it) {
4745 old_s->flushing_caps_tids.erase(it->first);
4746 new_s->flushing_caps_tids.insert(it->first);
4747 }
4748 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4749 }
4750
4751 /*
4752 * Flush all the dirty caps back to the MDS. Because the callers
4753 * generally wait on the result of this function (syncfs and umount
4754 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4755 */
4756 void Client::flush_caps_sync()
4757 {
4758 ldout(cct, 10) << __func__ << dendl;
4759 for (auto &q : mds_sessions) {
4760 auto s = q.second;
4761 xlist<Inode*>::iterator p = s->dirty_list.begin();
4762 while (!p.end()) {
4763 unsigned flags = CHECK_CAPS_NODELAY;
4764 Inode *in = *p;
4765
4766 ++p;
4767 if (p.end())
4768 flags |= CHECK_CAPS_SYNCHRONOUS;
4769 check_caps(in, flags);
4770 }
4771 }
4772 }
4773
4774 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4775 {
4776 while (in->flushing_caps) {
4777 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4778 ceph_assert(it != in->flushing_cap_tids.end());
4779 if (it->first > want)
4780 break;
4781 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4782 << ccap_string(it->second) << " want " << want
4783 << " last " << it->first << dendl;
4784 wait_on_list(in->waitfor_caps);
4785 }
4786 }
4787
4788 void Client::wait_sync_caps(ceph_tid_t want)
4789 {
4790 retry:
4791 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4792 << num_flushing_caps << " total flushing)" << dendl;
4793 for (auto &p : mds_sessions) {
4794 auto s = p.second;
4795 if (s->flushing_caps_tids.empty())
4796 continue;
4797 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4798 if (oldest_tid <= want) {
4799 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4800 << " (want " << want << ")" << dendl;
4801 std::unique_lock l{client_lock, std::adopt_lock};
4802 sync_cond.wait(l);
4803 l.release();
4804 goto retry;
4805 }
4806 }
4807 }
4808
4809 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4810 {
4811 in->flags &= ~I_KICK_FLUSH;
4812
4813 Cap *cap = in->auth_cap;
4814 ceph_assert(cap->session == session);
4815
4816 ceph_tid_t last_snap_flush = 0;
4817 for (auto p = in->flushing_cap_tids.rbegin();
4818 p != in->flushing_cap_tids.rend();
4819 ++p) {
4820 if (!p->second) {
4821 last_snap_flush = p->first;
4822 break;
4823 }
4824 }
4825
4826 int wanted = in->caps_wanted();
4827 int used = get_caps_used(in) | in->caps_dirty();
4828 auto it = in->cap_snaps.begin();
4829 for (auto& p : in->flushing_cap_tids) {
4830 if (p.second) {
4831 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4832 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4833 p.second, p.first);
4834 } else {
4835 ceph_assert(it != in->cap_snaps.end());
4836 ceph_assert(it->second.flush_tid == p.first);
4837 send_flush_snap(in, session, it->first, it->second);
4838 ++it;
4839 }
4840 }
4841 }
4842
4843 void Client::kick_flushing_caps(MetaSession *session)
4844 {
4845 mds_rank_t mds = session->mds_num;
4846 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4847
4848 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4849 Inode *in = *p;
4850 if (in->flags & I_KICK_FLUSH) {
4851 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4852 kick_flushing_caps(in, session);
4853 }
4854 }
4855 }
4856
4857 void Client::early_kick_flushing_caps(MetaSession *session)
4858 {
4859 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4860 Inode *in = *p;
4861 Cap *cap = in->auth_cap;
4862 ceph_assert(cap);
4863
4864 // if flushing caps were revoked, we re-send the cap flush during the client
4865 // reconnect stage. This guarantees that the MDS processes the cap flush message
4866 // before issuing the flushing caps to another client.
4867 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4868 in->flags |= I_KICK_FLUSH;
4869 continue;
4870 }
4871
4872 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4873 << " to mds." << session->mds_num << dendl;
4874 // send_reconnect() will also reset these sequence numbers. make sure the
4875 // sequence numbers in the cap flush message match the later reconnect message.
4876 cap->seq = 0;
4877 cap->issue_seq = 0;
4878 cap->mseq = 0;
4879 cap->issued = cap->implemented;
4880
4881 kick_flushing_caps(in, session);
4882 }
4883 }
4884
4885 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4886 {
4887 list<SnapRealm*> q;
4888 q.push_back(realm);
4889
4890 while (!q.empty()) {
4891 realm = q.front();
4892 q.pop_front();
4893
4894 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4895 realm->invalidate_cache();
4896
4897 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4898 p != realm->pchildren.end();
4899 ++p)
4900 q.push_back(*p);
4901 }
4902 }
4903
4904 SnapRealm *Client::get_snap_realm(inodeno_t r)
4905 {
4906 SnapRealm *realm = snap_realms[r];
4907
4908 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
4909 << (realm ? realm->nref : 0) << dendl;
4910 if (!realm) {
4911 snap_realms[r] = realm = new SnapRealm(r);
4912
4913 // Do not release the global snaprealm until unmounting.
4914 if (r == CEPH_INO_GLOBAL_SNAPREALM)
4915 realm->nref++;
4916 }
4917
4918 realm->nref++;
4919 ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
4920 << realm->nref << dendl;
4921 return realm;
4922 }
4923
4924 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4925 {
4926 if (snap_realms.count(r) == 0) {
4927 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4928 return NULL;
4929 }
4930 SnapRealm *realm = snap_realms[r];
4931 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4932 realm->nref++;
4933 return realm;
4934 }
4935
4936 void Client::put_snap_realm(SnapRealm *realm)
4937 {
4938 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4939 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4940 if (--realm->nref == 0) {
4941 snap_realms.erase(realm->ino);
4942 if (realm->pparent) {
4943 realm->pparent->pchildren.erase(realm);
4944 put_snap_realm(realm->pparent);
4945 }
4946 delete realm;
4947 }
4948 }
4949
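// Refcount sketch for the three functions above (editor's note): every
// successful get_snap_realm()/get_snap_realm_maybe() returns with one
// extra reference held for the caller (the global snaprealm additionally
// keeps a pin until unmount), and each must be balanced by exactly one
// put_snap_realm(); when nref reaches 0 the realm is unlinked from
// snap_realms, its reference on pparent is dropped, and it is deleted.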
4950 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4951 {
4952 if (realm->parent != parent) {
4953 ldout(cct, 10) << __func__ << " " << *realm
4954 << " " << realm->parent << " -> " << parent << dendl;
4955 realm->parent = parent;
4956 if (realm->pparent) {
4957 realm->pparent->pchildren.erase(realm);
4958 put_snap_realm(realm->pparent);
4959 }
4960 realm->pparent = get_snap_realm(parent);
4961 realm->pparent->pchildren.insert(realm);
4962 return true;
4963 }
4964 return false;
4965 }
4966
4967 static bool has_new_snaps(const SnapContext& old_snapc,
4968 const SnapContext& new_snapc)
4969 {
4970 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4971 }
4972
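// Example: SnapContext::snaps is ordered newest-first, so with
// old_snapc.seq = 10, a new_snapc whose newest snap id (snaps[0]) is 12
// means a snapshot was taken after old_snapc was captured -- the callers
// below then queue a cap snap so dirty data is flushed to the old snap.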
4973
4974 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4975 {
4976 SnapRealm *first_realm = NULL;
4977 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4978
4979 map<SnapRealm*, SnapContext> dirty_realms;
4980
4981 auto p = bl.cbegin();
4982 while (!p.end()) {
4983 SnapRealmInfo info;
4984 decode(info, p);
4985 SnapRealm *realm = get_snap_realm(info.ino());
4986
4987 bool invalidate = false;
4988
4989 if (info.seq() > realm->seq) {
4990 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4991 << dendl;
4992
4993 if (flush) {
4994 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4995 // flush me + children
4996 list<SnapRealm*> q;
4997 q.push_back(realm);
4998 while (!q.empty()) {
4999 SnapRealm *realm = q.front();
5000 q.pop_front();
5001
5002 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5003 p != realm->pchildren.end();
5004 ++p)
5005 q.push_back(*p);
5006
5007 if (dirty_realms.count(realm) == 0) {
5008 realm->nref++;
5009 dirty_realms[realm] = realm->get_snap_context();
5010 }
5011 }
5012 }
5013
5014 // update
5015 realm->seq = info.seq();
5016 realm->created = info.created();
5017 realm->parent_since = info.parent_since();
5018 realm->prior_parent_snaps = info.prior_parent_snaps;
5019 realm->my_snaps = info.my_snaps;
5020 invalidate = true;
5021 }
5022
5023 // _always_ verify parent
5024 if (adjust_realm_parent(realm, info.parent()))
5025 invalidate = true;
5026
5027 if (invalidate) {
5028 invalidate_snaprealm_and_children(realm);
5029 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
5030 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5031 } else {
5032 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
5033 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5034 }
5035
5036 if (!first_realm)
5037 first_realm = realm;
5038 else
5039 put_snap_realm(realm);
5040 }
5041
5042 for (auto &[realm, snapc] : dirty_realms) {
5043 // are there new snaps?
5044 if (has_new_snaps(snapc, realm->get_snap_context())) {
5045 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
5046 for (auto&& in : realm->inodes_with_caps) {
5047 queue_cap_snap(in, snapc);
5048 }
5049 } else {
5050 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5051 }
5052 put_snap_realm(realm);
5053 }
5054
5055 if (realm_ret)
5056 *realm_ret = first_realm;
5057 else
5058 put_snap_realm(first_realm);
5059 }
5060
5061 void Client::handle_snap(const MConstRef<MClientSnap>& m)
5062 {
5063 ldout(cct, 10) << __func__ << " " << *m << dendl;
5064 mds_rank_t mds = mds_rank_t(m->get_source().num());
5065
5066 std::scoped_lock cl(client_lock);
5067 auto session = _get_mds_session(mds, m->get_connection().get());
5068 if (!session) {
5069 return;
5070 }
5071
5072 got_mds_push(session.get());
5073
5074 map<Inode*, SnapContext> to_move;
5075 SnapRealm *realm = 0;
5076
5077 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
5078 ceph_assert(m->head.split);
5079 SnapRealmInfo info;
5080 auto p = m->bl.cbegin();
5081 decode(info, p);
5082 ceph_assert(info.ino() == m->head.split);
5083
5084 // flush, then move, the inos.
5085 realm = get_snap_realm(info.ino());
5086 ldout(cct, 10) << " splitting off " << *realm << dendl;
5087 for (auto& ino : m->split_inos) {
5088 vinodeno_t vino(ino, CEPH_NOSNAP);
5089 if (inode_map.count(vino)) {
5090 Inode *in = inode_map[vino];
5091 if (!in->snaprealm || in->snaprealm == realm)
5092 continue;
5093 if (in->snaprealm->created > info.created()) {
5094 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5095 << *in->snaprealm << dendl;
5096 continue;
5097 }
5098 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5099
5100
5101 in->snaprealm_item.remove_myself();
5102 to_move[in] = in->snaprealm->get_snap_context();
5103 put_snap_realm(in->snaprealm);
5104 }
5105 }
5106
5107 // move child snaprealms, too
5108 for (auto& child_realm : m->split_realms) {
5109 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5110 SnapRealm *child = get_snap_realm_maybe(child_realm);
5111 if (!child)
5112 continue;
5113 adjust_realm_parent(child, realm->ino);
5114 put_snap_realm(child);
5115 }
5116 }
5117
5118 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5119
5120 if (realm) {
5121 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5122 Inode *in = p->first;
5123 in->snaprealm = realm;
5124 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5125 realm->nref++;
5126 // queue for snap writeback
5127 if (has_new_snaps(p->second, realm->get_snap_context()))
5128 queue_cap_snap(in, p->second);
5129 }
5130 put_snap_realm(realm);
5131 }
5132 }
5133
5134 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5135 {
5136 mds_rank_t mds = mds_rank_t(m->get_source().num());
5137
5138 std::scoped_lock cl(client_lock);
5139 auto session = _get_mds_session(mds, m->get_connection().get());
5140 if (!session) {
5141 return;
5142 }
5143
5144 got_mds_push(session.get());
5145
5146 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5147
5148 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5149 if (inode_map.count(vino)) {
5150 Inode *in = NULL;
5151 in = inode_map[vino];
5152
5153 if (in) {
5154 in->quota = m->quota;
5155 in->rstat = m->rstat;
5156 }
5157 }
5158 }
5159
5160 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5161 {
5162 mds_rank_t mds = mds_rank_t(m->get_source().num());
5163
5164 std::scoped_lock cl(client_lock);
5165 auto session = _get_mds_session(mds, m->get_connection().get());
5166 if (!session) {
5167 return;
5168 }
5169
5170 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5171 // Pause RADOS operations until we see the required epoch
5172 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5173 }
5174
5175 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5176 // Record the barrier so that we will transmit it to MDS when releasing
5177 set_cap_epoch_barrier(m->osd_epoch_barrier);
5178 }
5179
5180 got_mds_push(session.get());
5181
5182 Inode *in;
5183 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5184 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5185 in = it->second;
5186 } else {
5187 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5188 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5189 session->enqueue_cap_release(
5190 m->get_ino(),
5191 m->get_cap_id(),
5192 m->get_seq(),
5193 m->get_mseq(),
5194 cap_epoch_barrier);
5195 } else {
5196 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5197 }
5198
5199 // in case the mds is waiting on e.g. a revocation
5200 flush_cap_releases();
5201 return;
5202 }
5203
5204 switch (m->get_op()) {
5205 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5206 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5207 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
5208 }
5209
5210 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5211 Cap &cap = in->caps.at(mds);
5212
5213 switch (m->get_op()) {
5214 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
5215 case CEPH_CAP_OP_IMPORT:
5216 case CEPH_CAP_OP_REVOKE:
5217 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5218 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
5219 }
5220 } else {
5221 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5222 return;
5223 }
5224 }
5225
5226 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5227 {
5228 mds_rank_t mds = session->mds_num;
5229
5230 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5231 << " IMPORT from mds." << mds << dendl;
5232
5233 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5234 Cap *cap = NULL;
5235 UserPerm cap_perms;
5236 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5237 cap = &it->second;
5238 cap_perms = cap->latest_perms;
5239 }
5240
5241 // add/update it
5242 SnapRealm *realm = NULL;
5243 update_snap_trace(m->snapbl, &realm);
5244
5245 int issued = m->get_caps();
5246 int wanted = m->get_wanted();
5247 add_update_cap(in, session, m->get_cap_id(),
5248 issued, wanted, m->get_seq(), m->get_mseq(),
5249 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5250
5251 if (cap && cap->cap_id == m->peer.cap_id) {
5252 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5253 }
5254
5255 if (realm)
5256 put_snap_realm(realm);
5257
5258 if (in->auth_cap && in->auth_cap->session == session) {
5259 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5260 in->requested_max_size > m->get_max_size()) {
5261 in->requested_max_size = 0;
5262 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5263 }
5264 // reflush any/all caps (if we are now the auth_cap)
5265 kick_flushing_caps(in, session);
5266 }
5267 }
5268
5269 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5270 {
5271 mds_rank_t mds = session->mds_num;
5272
5273 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5274 << " EXPORT from mds." << mds << dendl;
5275
5276 auto it = in->caps.find(mds);
5277 if (it != in->caps.end()) {
5278 Cap &cap = it->second;
5279 if (cap.cap_id == m->get_cap_id()) {
5280 if (m->peer.cap_id) {
5281 const auto peer_mds = mds_rank_t(m->peer.mds);
5282 auto tsession = _get_or_open_mds_session(peer_mds);
5283 auto it = in->caps.find(peer_mds);
5284 if (it != in->caps.end()) {
5285 Cap &tcap = it->second;
5286 if (tcap.cap_id == m->peer.cap_id &&
5287 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5288 tcap.cap_id = m->peer.cap_id;
5289 tcap.seq = m->peer.seq - 1;
5290 tcap.issue_seq = tcap.seq;
5291 tcap.issued |= cap.issued;
5292 tcap.implemented |= cap.issued;
5293 if (&cap == in->auth_cap)
5294 in->auth_cap = &tcap;
5295 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5296 adjust_session_flushing_caps(in, session, tsession.get());
5297 }
5298 } else {
5299 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
5300 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5301 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5302 cap.latest_perms);
5303 }
5304 } else {
5305 if (cap.wanted | cap.issued)
5306 in->flags |= I_CAP_DROPPED;
5307 }
5308
5309 remove_cap(&cap, false);
5310 }
5311 }
5312 }
5313
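// Cap migration sketch (editor's summary of the import/export pair above):
// on EXPORT the old MDS names its peer; if we already hold a cap from that
// peer we fold the exported bits into it, otherwise we pre-create one with
// seq = peer.seq - 1 so the later IMPORT (handled above) supersedes it;
// either way the exporting MDS' cap is removed locally.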
5314 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5315 {
5316 mds_rank_t mds = session->mds_num;
5317 ceph_assert(in->caps.count(mds));
5318
5319 ldout(cct, 10) << __func__ << " on ino " << *in
5320 << " size " << in->size << " -> " << m->get_size()
5321 << dendl;
5322
5323 int issued;
5324 in->caps_issued(&issued);
5325 issued |= in->caps_dirty();
5326 update_inode_file_size(in, issued, m->get_size(),
5327 m->get_truncate_seq(), m->get_truncate_size());
5328 }
5329
5330 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5331 {
5332 ceph_tid_t flush_ack_tid = m->get_client_tid();
5333 int dirty = m->get_dirty();
5334 int cleaned = 0;
5335 int flushed = 0;
5336
5337 auto it = in->flushing_cap_tids.begin();
5338 if (it->first < flush_ack_tid) {
5339 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5340 << " got unexpected flush ack tid " << flush_ack_tid
5341 << " expected is " << it->first << dendl;
5342 }
5343 for (; it != in->flushing_cap_tids.end(); ) {
5344 if (!it->second) {
5345 // cap snap
5346 ++it;
5347 continue;
5348 }
5349 if (it->first == flush_ack_tid)
5350 cleaned = it->second;
5351 if (it->first <= flush_ack_tid) {
5352 session->flushing_caps_tids.erase(it->first);
5353 in->flushing_cap_tids.erase(it++);
5354 ++flushed;
5355 continue;
5356 }
5357 cleaned &= ~it->second;
5358 if (!cleaned)
5359 break;
5360 ++it;
5361 }
5362
5363 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5364 << " cleaned " << ccap_string(cleaned) << " on " << *in
5365 << " with " << ccap_string(dirty) << dendl;
5366
5367 if (flushed) {
5368 signal_cond_list(in->waitfor_caps);
5369 if (session->flushing_caps_tids.empty() ||
5370 *session->flushing_caps_tids.begin() > flush_ack_tid)
5371 sync_cond.notify_all();
5372 }
5373
5374 if (!dirty) {
5375 in->cap_dirtier_uid = -1;
5376 in->cap_dirtier_gid = -1;
5377 }
5378
5379 if (!cleaned) {
5380 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5381 } else {
5382 if (in->flushing_caps) {
5383 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5384 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5385 in->flushing_caps &= ~cleaned;
5386 if (in->flushing_caps == 0) {
5387 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5388 num_flushing_caps--;
5389 if (in->flushing_cap_tids.empty())
5390 in->flushing_cap_item.remove_myself();
5391 }
5392 if (!in->caps_dirty())
5393 put_inode(in);
5394 }
5395 }
5396 }
5397
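// Worked example of the tid sweep above (illustrative): with
// flushing_cap_tids = {9: Fw|Fx, 11: 0 (capsnap), 12: Fx} and an ack for
// tid 9, entry 9 is erased and cleaned starts as Fw|Fx; the capsnap entry
// at 11 is skipped; entry 12 then masks cleaned with ~Fx, because Fx is
// still being flushed under a later tid -- so only Fw is cleared from
// in->flushing_caps.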
5398
5399 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5400 {
5401 ceph_tid_t flush_ack_tid = m->get_client_tid();
5402 mds_rank_t mds = session->mds_num;
5403 ceph_assert(in->caps.count(mds));
5404 snapid_t follows = m->get_snap_follows();
5405
5406 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5407 auto& capsnap = it->second;
5408 if (flush_ack_tid != capsnap.flush_tid) {
5409 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5410 } else {
5411 InodeRef tmp_ref(in);
5412 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5413 << " on " << *in << dendl;
5414 session->flushing_caps_tids.erase(capsnap.flush_tid);
5415 in->flushing_cap_tids.erase(capsnap.flush_tid);
5416 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5417 in->flushing_cap_item.remove_myself();
5418 in->cap_snaps.erase(it);
5419
5420 signal_cond_list(in->waitfor_caps);
5421 if (session->flushing_caps_tids.empty() ||
5422 *session->flushing_caps_tids.begin() > flush_ack_tid)
5423 sync_cond.notify_all();
5424 }
5425 } else {
5426 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5427 << " on " << *in << dendl;
5428 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5429 }
5430 }
5431
5432 class C_Client_DentryInvalidate : public Context {
5433 private:
5434 Client *client;
5435 vinodeno_t dirino;
5436 vinodeno_t ino;
5437 string name;
5438 public:
5439 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5440 client(c), name(dn->name) {
5441 if (client->use_faked_inos()) {
5442 dirino.ino = dn->dir->parent_inode->faked_ino;
5443 if (del)
5444 ino.ino = dn->inode->faked_ino;
5445 } else {
5446 dirino = dn->dir->parent_inode->vino();
5447 if (del)
5448 ino = dn->inode->vino();
5449 }
5450 if (!del)
5451 ino.ino = inodeno_t();
5452 }
5453 void finish(int r) override {
5454 // _async_dentry_invalidate is responsible for its own locking
5455 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5456 client->_async_dentry_invalidate(dirino, ino, name);
5457 }
5458 };
5459
5460 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5461 {
5462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5463 if (!mref_reader.is_state_satisfied())
5464 return;
5465
5466 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5467 << " in dir " << dirino << dendl;
5468 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5469 }
5470
5471 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5472 {
5473 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5474 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5475 }
5476
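// The invalidation above is delivered through a callback registered by the
// libcephfs consumer. A minimal sketch, assuming the
// ceph_ll_register_callbacks() entry point and the args struct from
// include/cephfs/ceph_ll_client.h (function and field names should be
// double-checked against that header); deliberately kept out of the build:
#if 0
static void my_dentry_invalidate(void *handle, vinodeno_t dirino,
                                 vinodeno_t ino, const char *name,
                                 size_t len)
{
  // drop the consumer's cached dentry `name` of directory `dirino` here;
  // `ino` is only meaningful when the dentry itself is being deleted
}

static void register_invalidate_cb(struct ceph_mount_info *cmount)
{
  struct ceph_client_callback_args args = {};
  args.handle = nullptr;                 // passed back as `handle`
  args.dentry_cb = my_dentry_invalidate; // wired to dentry_invalidate_cb
  ceph_ll_register_callbacks(cmount, &args);
}
#endif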
5477 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5478 {
5479 int ref = in->get_nref();
5480 ldout(cct, 5) << __func__ << " in " << *in <<dendl;
5481
5482 if (in->dir && !in->dir->dentries.empty()) {
5483 for (auto p = in->dir->dentries.begin();
5484 p != in->dir->dentries.end(); ) {
5485 Dentry *dn = p->second;
5486 ++p;
5487 /* rmsnap removes the whole subtree, so we need to trim inodes
5488 * recursively. we don't need to invalidate dentries recursively,
5489 * because invalidating a directory dentry effectively invalidates
5490 * the whole subtree */
5491 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5492 _try_to_trim_inode(dn->inode.get(), false);
5493
5494 if (dn->lru_is_expireable())
5495 unlink(dn, true, false); // keep dir, drop dentry
5496 }
5497 if (in->dir->dentries.empty()) {
5498 close_dir(in->dir);
5499 --ref;
5500 }
5501 }
5502
5503 if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
5504 InodeRef snapdir = open_snapdir(in);
5505 _try_to_trim_inode(snapdir.get(), false);
5506 --ref;
5507 }
5508
5509 if (ref > 1) {
5510 auto q = in->dentries.begin();
5511 while (q != in->dentries.end()) {
5512 Dentry *dn = *q;
5513 ++q;
5514 if (in->ll_ref > 0 && sched_inval) {
5515 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5516 // so in->dentries doesn't always reflect the state of kernel's dcache.
5517 _schedule_invalidate_dentry_callback(dn, true);
5518 }
5519 unlink(dn, true, true);
5520 }
5521 }
5522 }
5523
5524 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5525 {
5526 mds_rank_t mds = session->mds_num;
5527 int used = get_caps_used(in);
5528 int wanted = in->caps_wanted();
5529 int flags = 0;
5530
5531 const unsigned new_caps = m->get_caps();
5532 const bool was_stale = session->cap_gen > cap->gen;
5533 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5534 << " mds." << mds << " seq " << m->get_seq()
5535 << " caps now " << ccap_string(new_caps)
5536 << " was " << ccap_string(cap->issued)
5537 << (was_stale ? " (stale)" : "") << dendl;
5538
5539 if (was_stale)
5540 cap->issued = cap->implemented = CEPH_CAP_PIN;
5541 cap->seq = m->get_seq();
5542 cap->gen = session->cap_gen;
5543
5544 check_cap_issue(in, new_caps);
5545
5546 // update inode
5547 int issued;
5548 in->caps_issued(&issued);
5549 issued |= in->caps_dirty();
5550
5551 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5552 !(issued & CEPH_CAP_AUTH_EXCL)) {
5553 in->mode = m->head.mode;
5554 in->uid = m->head.uid;
5555 in->gid = m->head.gid;
5556 in->btime = m->btime;
5557 }
5558 bool deleted_inode = false;
5559 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5560 !(issued & CEPH_CAP_LINK_EXCL)) {
5561 in->nlink = m->head.nlink;
5562 if (in->nlink == 0)
5563 deleted_inode = true;
5564 }
5565 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5566 m->xattrbl.length() &&
5567 m->head.xattr_version > in->xattr_version) {
5568 auto p = m->xattrbl.cbegin();
5569 decode(in->xattrs, p);
5570 in->xattr_version = m->head.xattr_version;
5571 }
5572
5573 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5574 in->dirstat.nfiles = m->get_nfiles();
5575 in->dirstat.nsubdirs = m->get_nsubdirs();
5576 }
5577
5578 if (new_caps & CEPH_CAP_ANY_RD) {
5579 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5580 m->get_ctime(), m->get_mtime(), m->get_atime());
5581 }
5582
5583 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5584 in->layout = m->get_layout();
5585 update_inode_file_size(in, issued, m->get_size(),
5586 m->get_truncate_seq(), m->get_truncate_size());
5587 }
5588
5589 if (m->inline_version > in->inline_version) {
5590 in->inline_data = m->inline_data;
5591 in->inline_version = m->inline_version;
5592 }
5593
5594 /* always take a newer change attr */
5595 if (m->get_change_attr() > in->change_attr)
5596 in->change_attr = m->get_change_attr();
5597
5598 // max_size
5599 if (cap == in->auth_cap &&
5600 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5601 (m->get_max_size() != in->max_size)) {
5602 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5603 in->max_size = m->get_max_size();
5604 if (in->max_size > in->wanted_max_size) {
5605 in->wanted_max_size = 0;
5606 in->requested_max_size = 0;
5607 }
5608 }
5609
5610 bool check = false;
5611 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5612 (wanted & ~(cap->wanted | new_caps))) {
5613 // If the mds is importing the cap, prior cap messages that update
5614 // 'wanted' may get dropped by the mds (migrate seq mismatch).
5615 //
5616 // We don't send a cap message to update 'wanted' if what we want is
5617 // already issued. If the mds revokes caps, the cap message releasing
5618 // caps also tells the mds what we want. But if caps were forcibly
5619 // revoked by the mds (stale session), we may not have told it yet.
5620 check = true;
5621 }
5622
5623
5624 // update caps
5625 auto revoked = cap->issued & ~new_caps;
5626 if (revoked) {
5627 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5628 cap->issued = new_caps;
5629 cap->implemented |= new_caps;
5630
5631 // recall delegations if we're losing caps necessary for them
5632 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5633 in->recall_deleg(false);
5634 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5635 in->recall_deleg(true);
5636
5637 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5638 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5639 !_flush(in, new C_Client_FlushComplete(this, in))) {
5640 // waitin' for flush
5641 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5642 if (_release(in)) {
5643 check = true;
5644 flags = CHECK_CAPS_NODELAY;
5645 }
5646 } else {
5647 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5648 check = true;
5649 flags = CHECK_CAPS_NODELAY;
5650 }
5651 } else if (cap->issued == new_caps) {
5652 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5653 } else {
5654 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5655 cap->issued = new_caps;
5656 cap->implemented |= new_caps;
5657
5658 if (cap == in->auth_cap) {
5659 // is a non-auth MDS revoking the newly granted caps?
5660 for (const auto &p : in->caps) {
5661 if (&p.second == cap)
5662 continue;
5663 if (p.second.implemented & ~p.second.issued & new_caps) {
5664 check = true;
5665 break;
5666 }
5667 }
5668 }
5669 }
5670
5671 if (check)
5672 check_caps(in, flags);
5673
5674 // wake up waiters
5675 if (new_caps)
5676 signal_cond_list(in->waitfor_caps);
5677
5678 // may drop inode's last ref
5679 if (deleted_inode)
5680 _try_to_trim_inode(in, true);
5681 }
5682
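// Worked example of the revoke branch above (cap strings made up): if
// cap->issued is "pAsLsXsFscrwb" and the grant carries new_caps
// "pAsLsXsFscr", then revoked == "Fwb". With dirty buffers in use (Fb)
// the client first flushes and the C_Client_FlushComplete continuation
// answers the MDS from the flush completion instead of checking
// immediately; with only cached data (Fc) it releases the cache and calls
// check_caps(in, CHECK_CAPS_NODELAY) right away.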
5683 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5684 {
5685 if (perms.uid() == 0) {
5686 // For directories, DACs are overridable.
5687 // For files, read/write DACs are always overridable, but exec DACs are
5688 // overridable only when at least one exec bit is set.
5689 if (!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
5690 return -CEPHFS_EACCES;
5691 return 0;
5692 }
5693
5694 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5695 int ret = _posix_acl_permission(in, perms, want);
5696 if (ret != -CEPHFS_EAGAIN)
5697 return ret;
5698 }
5699
5700 // check permissions before doing anything else
5701 if (!in->check_mode(perms, want))
5702 return -CEPHFS_EACCES;
5703 return 0;
5704 }
5705
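// Example of the uid-0 special case above: even root is denied MAY_EXEC on
// a regular file with mode 0644 (no exec bit anywhere) and gets
// -CEPHFS_EACCES, while MAY_READ | MAY_WRITE on the same file succeeds and
// any access to a directory is always granted for uid 0.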
5706 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5707 const UserPerm& perms)
5708 {
5709 int r = _getattr_for_perm(in, perms);
5710 if (r < 0)
5711 goto out;
5712
5713 r = 0;
5714 if (strncmp(name, "system.", 7) == 0) {
5715 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5716 r = -CEPHFS_EPERM;
5717 } else {
5718 r = inode_permission(in, perms, want);
5719 }
5720 out:
5721 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5722 return r;
5723 }
5724
5725 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5726 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5727 return out;
5728 }
5729
5730 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5731 const UserPerm& perms)
5732 {
5733 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5734 int r = _getattr_for_perm(in, perms);
5735 if (r < 0)
5736 goto out;
5737
5738 if (mask & CEPH_SETATTR_SIZE) {
5739 r = inode_permission(in, perms, MAY_WRITE);
5740 if (r < 0)
5741 goto out;
5742 }
5743
5744 r = -CEPHFS_EPERM;
5745 if (mask & CEPH_SETATTR_UID) {
5746 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5747 goto out;
5748 }
5749 if (mask & CEPH_SETATTR_GID) {
5750 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5751 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5752 goto out;
5753 }
5754
5755 if (mask & CEPH_SETATTR_MODE) {
5756 if (perms.uid() != 0 && perms.uid() != in->uid)
5757 goto out;
5758
5759 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5760 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5761 stx->stx_mode &= ~S_ISGID;
5762 }
5763
5764 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5765 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5766 if (perms.uid() != 0 && perms.uid() != in->uid) {
5767 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5768 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5769 check_mask |= CEPH_SETATTR_MTIME;
5770 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5771 check_mask |= CEPH_SETATTR_ATIME;
5772 if (check_mask & mask) {
5773 goto out;
5774 } else {
5775 r = inode_permission(in, perms, MAY_WRITE);
5776 if (r < 0)
5777 goto out;
5778 }
5779 }
5780 }
5781 r = 0;
5782 out:
5783 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5784 return r;
5785 }
5786
5787 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5788 {
5789 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5790 unsigned want = 0;
5791
5792 if ((flags & O_ACCMODE) == O_WRONLY)
5793 want = MAY_WRITE;
5794 else if ((flags & O_ACCMODE) == O_RDWR)
5795 want = MAY_READ | MAY_WRITE;
5796 else if ((flags & O_ACCMODE) == O_RDONLY)
5797 want = MAY_READ;
5798 if (flags & O_TRUNC)
5799 want |= MAY_WRITE;
5800
5801 int r = 0;
5802 switch (in->mode & S_IFMT) {
5803 case S_IFLNK:
5804 r = -CEPHFS_ELOOP;
5805 goto out;
5806 case S_IFDIR:
5807 if (want & MAY_WRITE) {
5808 r = -CEPHFS_EISDIR;
5809 goto out;
5810 }
5811 break;
5812 }
5813
5814 r = _getattr_for_perm(in, perms);
5815 if (r < 0)
5816 goto out;
5817
5818 r = inode_permission(in, perms, want);
5819 out:
5820 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5821 return r;
5822 }
5823
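// How the open flags map onto the permission bits checked above:
//   O_RDONLY           -> MAY_READ
//   O_WRONLY           -> MAY_WRITE
//   O_RDWR             -> MAY_READ | MAY_WRITE
//   O_RDONLY | O_TRUNC -> MAY_READ | MAY_WRITE (truncation needs write)
// Symlinks fail with -CEPHFS_ELOOP and directories opened for write fail
// with -CEPHFS_EISDIR before any permission check is made.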
5824 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5825 {
5826 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5827 int r = _getattr_for_perm(dir, perms);
5828 if (r < 0)
5829 goto out;
5830
5831 r = inode_permission(dir, perms, MAY_EXEC);
5832 out:
5833 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5834 return r;
5835 }
5836
5837 int Client::may_create(Inode *dir, const UserPerm& perms)
5838 {
5839 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5840 int r = _getattr_for_perm(dir, perms);
5841 if (r < 0)
5842 goto out;
5843
5844 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5845 out:
5846 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5847 return r;
5848 }
5849
5850 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5851 {
5852 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5853 int r = _getattr_for_perm(dir, perms);
5854 if (r < 0)
5855 goto out;
5856
5857 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5858 if (r < 0)
5859 goto out;
5860
5861 /* 'name == NULL' means rmsnap w/o permission checks */
5862 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5863 InodeRef otherin;
5864 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5865 if (r < 0)
5866 goto out;
5867 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5868 r = -CEPHFS_EPERM;
5869 }
5870 out:
5871 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5872 return r;
5873 }
5874
5875 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5876 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5877
5878 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5879 if (!mref_reader.is_state_satisfied())
5880 return -CEPHFS_ENOTCONN;
5881
5882 filepath path(relpath);
5883 string name = path.last_dentry();
5884 path.pop_dentry();
5885 InodeRef dir;
5886
5887 std::scoped_lock lock(client_lock);
5888 int r = path_walk(path, &dir, perms);
5889 if (r < 0)
5890 return r;
5891 if (cct->_conf->client_permissions) {
5892 int r = may_delete(dir.get(), name.c_str(), perms);
5893 if (r < 0)
5894 return r;
5895 }
5896
5897 return 0;
5898 }
5899
5900 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5901 {
5902 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5903 int r = _getattr_for_perm(in, perms);
5904 if (r < 0)
5905 goto out;
5906
5907 if (perms.uid() == 0 || perms.uid() == in->uid) {
5908 r = 0;
5909 goto out;
5910 }
5911
5912 r = -CEPHFS_EPERM;
5913 if (!S_ISREG(in->mode))
5914 goto out;
5915
5916 if (in->mode & S_ISUID)
5917 goto out;
5918
5919 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5920 goto out;
5921
5922 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5923 out:
5924 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5925 return r;
5926 }
5927
5928 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5929 {
5930 int mask = CEPH_STAT_CAP_MODE;
5931 bool force = false;
5932 if (acl_type != NO_ACL) {
5933 mask |= CEPH_STAT_CAP_XATTR;
5934 force = in->xattr_version == 0;
5935 }
5936 return _getattr(in, mask, perms, force);
5937 }
5938
5939 vinodeno_t Client::_get_vino(Inode *in)
5940 {
5941 /* The caller must hold the client lock */
5942 return vinodeno_t(in->ino, in->snapid);
5943 }
5944
5945 /**
5946 * Resolve an MDS spec to a list of MDS daemon GIDs.
5947 *
5948 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5949 * It may be '*' in which case it matches all GIDs.
5950 *
5951 * If no error is returned, the `targets` vector will be populated with at least
5952 * one MDS.
5953 */
5954 int Client::resolve_mds(
5955 const std::string &mds_spec,
5956 std::vector<mds_gid_t> *targets)
5957 {
5958 ceph_assert(fsmap);
5959 ceph_assert(targets != nullptr);
5960
5961 mds_role_t role;
5962 CachedStackStringStream css;
5963 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5964 if (role_r == 0) {
5965 // We got a role, resolve it to a GID
5966 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5967 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5968 << role << "' aka " << info.human_name() << dendl;
5969 targets->push_back(info.global_id);
5970 return 0;
5971 }
5972
5973 std::string strtol_err;
5974 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5975 if (strtol_err.empty()) {
5976 // It is a possible GID
5977 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5978 if (fsmap->gid_exists(mds_gid)) {
5979 auto& info = fsmap->get_info_gid(mds_gid);
5980 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5981 << info.human_name() << dendl;
5982 targets->push_back(mds_gid);
5983 return 0;
5984 } else {
5985 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5986 << dendl;
5987 lderr(cct) << "FSMap: " << *fsmap << dendl;
5988 return -CEPHFS_ENOENT;
5989 }
5990 } else if (mds_spec == "*") {
5991 // It is a wildcard: use all MDSs
5992 const auto& mds_info = fsmap->get_mds_info();
5993
5994 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5995 if (mds_info.empty()) {
5996 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5997 lderr(cct) << "FSMap: " << *fsmap << dendl;
5998 return -CEPHFS_ENOENT;
5999 }
6000
6001 for (const auto& [gid, info] : mds_info) {
6002 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
6003 targets->push_back(gid);
6004 }
6005 return 0;
6006 } else {
6007 // It did not parse as an integer and is not a wildcard, so it must be a name
6008 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
6009 if (mds_gid == 0) {
6010 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
6011 lderr(cct) << "FSMap: " << *fsmap << dendl;
6012 return -CEPHFS_ENOENT;
6013 } else {
6014 auto& info = fsmap->get_info_gid(mds_gid);
6015 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
6016 << "' to " << info.human_name() << dendl;
6017 targets->push_back(mds_gid);
6018 }
6019 return 0;
6020 }
6021 }
6022
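// Illustrative spec strings resolve_mds() accepts (the daemon name and GID
// below are hypothetical):
//   "0"        -> rank 0, resolved via FSMap::parse_role
//   "cephfs:0" -> rank 0 of filesystem "cephfs"
//   "123456"   -> a GID, accepted only if it exists in the FSMap
//   "a"        -> a daemon by name, via find_mds_gid_by_name
//   "*"        -> every MDS daemon known to the FSMap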
6023
6024 /**
6025 * Authenticate with mon and establish global ID
6026 */
6027 int Client::authenticate()
6028 {
6029 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6030
6031 if (monclient->is_authenticated()) {
6032 return 0;
6033 }
6034
6035 client_lock.unlock();
6036 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
6037 client_lock.lock();
6038 if (r < 0) {
6039 return r;
6040 }
6041
6042 whoami = monclient->get_global_id();
6043 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6044
6045 return 0;
6046 }
6047
6048 int Client::fetch_fsmap(bool user)
6049 {
6050 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6051
6052 // Retrieve the FSMap to enable looking up daemon addresses. We need the
6053 // FSMap rather than an MDSMap because no single MDSMap contains all the
6054 // daemons, and a `tell` can address any daemon.
6055 version_t fsmap_latest;
6056 bs::error_code ec;
6057 do {
6058 client_lock.unlock();
6059 std::tie(fsmap_latest, std::ignore) =
6060 monclient->get_version("fsmap", ca::use_blocked[ec]);
6061 client_lock.lock();
6062 } while (ec == bs::errc::resource_unavailable_try_again);
6063
6064 if (ec) {
6065 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
6066 return ceph::from_error_code(ec);
6067 }
6068
6069 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
6070
6071 if (user) {
6072 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
6073 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
6074 monclient->renew_subs();
6075 wait_on_list(waiting_for_fsmap);
6076 }
6077 ceph_assert(fsmap_user);
6078 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
6079 } else {
6080 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
6081 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
6082 monclient->renew_subs();
6083 wait_on_list(waiting_for_fsmap);
6084 }
6085 ceph_assert(fsmap);
6086 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
6087 }
6088 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
6089 << fsmap_latest << dendl;
6090 return 0;
6091 }
6092
6093 /**
6094 * Send a command to MDS daemon(s) matching the given spec.
6095 *
6096 * @mds_spec one of name/ID, rank, GID, "*"
6097 */
6098 int Client::mds_command(
6099 const std::string &mds_spec,
6100 const vector<string>& cmd,
6101 const bufferlist& inbl,
6102 bufferlist *outbl,
6103 string *outs,
6104 Context *onfinish)
6105 {
6106 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6107 if (!iref_reader.is_state_satisfied())
6108 return -CEPHFS_ENOTCONN;
6109
6110 std::unique_lock cl(client_lock);
6111
6112 int r;
6113 r = authenticate();
6114 if (r < 0) {
6115 return r;
6116 }
6117
6118 r = fetch_fsmap(false);
6119 if (r < 0) {
6120 return r;
6121 }
6122
6123 // Look up MDS target(s) of the command
6124 std::vector<mds_gid_t> targets;
6125 r = resolve_mds(mds_spec, &targets);
6126 if (r < 0) {
6127 return r;
6128 }
6129
6130 // If daemons are laggy, we won't send them commands. If all
6131 // are laggy then we fail.
6132 std::vector<mds_gid_t> non_laggy;
6133 for (const auto& gid : targets) {
6134 const auto info = fsmap->get_info_gid(gid);
6135 if (!info.laggy()) {
6136 non_laggy.push_back(gid);
6137 }
6138 }
6139 if (non_laggy.empty()) {
6140 *outs = "All targeted MDS daemons are laggy";
6141 return -CEPHFS_ENOENT;
6142 }
6143
6144 if (metadata.empty()) {
6145 // We are called on an unmounted client, so metadata
6146 // won't be initialized yet.
6147 populate_metadata("");
6148 }
6149
6150 // Send commands to targets
6151 C_GatherBuilder gather(cct, onfinish);
6152 for (const auto& target_gid : non_laggy) {
6153 const auto info = fsmap->get_info_gid(target_gid);
6154
6155 // Open a connection to the target MDS
6156 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6157
6158 cl.unlock();
6159 {
6160 std::scoped_lock cmd_lock(command_lock);
6161 // Generate MDSCommandOp state
6162 auto &op = command_table.start_command();
6163
6164 op.on_finish = gather.new_sub();
6165 op.cmd = cmd;
6166 op.outbl = outbl;
6167 op.outs = outs;
6168 op.inbl = inbl;
6169 op.mds_gid = target_gid;
6170 op.con = conn;
6171
6172 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6173 << " tid=" << op.tid << cmd << dendl;
6174
6175 // Construct and send MCommand
6176 MessageRef m = op.get_message(monclient->get_fsid());
6177 conn->send_message2(std::move(m));
6178 }
6179 cl.lock();
6180 }
6181 gather.activate();
6182
6183 return 0;
6184 }
6185
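// A hedged usage sketch for mds_command() (the JSON command and helper name
// are illustrative; assumes an initialized Client); kept out of the build:
#if 0
static int example_mds_command(Client *client)
{
  std::vector<std::string> cmd = {"{\"prefix\": \"session ls\"}"};
  bufferlist inbl, outbl;
  std::string outs;
  C_SaferCond cond;
  int r = client->mds_command("*", cmd, inbl, &outbl, &outs, &cond);
  if (r < 0)
    return r;         // resolve/authenticate failed; `cond` will never fire
  return cond.wait(); // completes once every targeted MDS has replied
}
#endif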
6186 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6187 {
6188 ceph_tid_t const tid = m->get_tid();
6189
6190 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6191
6192 std::scoped_lock cmd_lock(command_lock);
6193 if (!command_table.exists(tid)) {
6194 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6195 return;
6196 }
6197
6198 auto &op = command_table.get_command(tid);
6199 if (op.outbl) {
6200 *op.outbl = m->get_data();
6201 }
6202 if (op.outs) {
6203 *op.outs = m->rs;
6204 }
6205
6206 if (op.on_finish) {
6207 op.on_finish->complete(m->r);
6208 }
6209
6210 command_table.erase(tid);
6211 }
6212
6213 // -------------------
6214 // MOUNT
6215
6216 int Client::subscribe_mdsmap(const std::string &fs_name)
6217 {
6218 int r = authenticate();
6219 if (r < 0) {
6220 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6221 return r;
6222 }
6223
6224 std::string resolved_fs_name;
6225 if (fs_name.empty()) {
6226 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6227 if (resolved_fs_name.empty())
6228 // Try the backwards compatibility fs name option
6229 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6230 } else {
6231 resolved_fs_name = fs_name;
6232 }
6233
6234 std::string want = "mdsmap";
6235 if (!resolved_fs_name.empty()) {
6236 r = fetch_fsmap(true);
6237 if (r < 0)
6238 return r;
6239 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6240 if (fscid == FS_CLUSTER_ID_NONE) {
6241 return -CEPHFS_ENOENT;
6242 }
6243
6244 std::ostringstream oss;
6245 oss << want << "." << fscid;
6246 want = oss.str();
6247 }
6248 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6249
6250 monclient->sub_want(want, 0, 0);
6251 monclient->renew_subs();
6252
6253 return 0;
6254 }
6255
6256 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6257 bool require_mds, const std::string &fs_name)
6258 {
6259 ceph_assert(is_initialized());
6260
6261 /*
6262 * Make sure that _unmount() waits until this mount() has
6263 * finished.
6264 */
6265 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6266 if (!mref_writer.is_first_writer()) // already mounting or mounted
6267 return 0;
6268
6269 std::unique_lock cl(client_lock);
6270
6271 int r = subscribe_mdsmap(fs_name);
6272 if (r < 0) {
6273 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6274 return r;
6275 }
6276
6277 start_tick_thread(); // start tick thread
6278
6279 if (require_mds) {
6280 while (1) {
6281 auto availability = mdsmap->is_cluster_available();
6282 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6283 // Error out
6284 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6285 return CEPH_FUSE_NO_MDS_UP;
6286 } else if (availability == MDSMap::AVAILABLE) {
6287 // Continue to mount
6288 break;
6289 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6290 // Else, wait. MDSMonitor will update the map to bring
6291 // us to a conclusion eventually.
6292 wait_on_list(waiting_for_mdsmap);
6293 } else {
6294 // Unexpected value!
6295 ceph_abort();
6296 }
6297 }
6298 }
6299
6300 populate_metadata(mount_root.empty() ? "/" : mount_root);
6301
6302 filepath fp(CEPH_INO_ROOT);
6303 if (!mount_root.empty()) {
6304 fp = filepath(mount_root.c_str());
6305 }
6306 while (true) {
6307 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6308 req->set_filepath(fp);
6309 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6310 int res = make_request(req, perms);
6311 if (res < 0) {
6312 if (res == -CEPHFS_EACCES && root) {
6313 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6314 break;
6315 }
6316 return res;
6317 }
6318
6319 if (fp.depth())
6320 fp.pop_dentry();
6321 else
6322 break;
6323 }
6324
6325 ceph_assert(root);
6326 _ll_get(root.get());
6327
6328 // trace?
6329 if (!cct->_conf->client_trace.empty()) {
6330 traceout.open(cct->_conf->client_trace.c_str());
6331 if (traceout.is_open()) {
6332 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6333 } else {
6334 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6335 }
6336 }
6337
6338 /*
6339 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6340 ldout(cct, 3) << "op: struct stat st;" << dendl;
6341 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6342 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6343 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6344 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6345 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6346 ldout(cct, 3) << "op: int fd;" << dendl;
6347 */
6348
6349 mref_writer.update_state(CLIENT_MOUNTED);
6350 return 0;
6351 }
6352
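// A hedged mount/unmount sketch (the fs name and helper are illustrative;
// assumes Client::init() has already run); kept out of the build:
#if 0
static int example_mount(Client *client, const UserPerm &perms)
{
  // mount_root "" means "/"; require_mds=true would additionally wait for
  // (or error out on) MDS cluster availability before proceeding
  int r = client->mount("", perms, false, "cephfs");
  if (r < 0)
    return r;
  // ... perform I/O ...
  client->unmount();
  return 0;
}
#endif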
6353 // UNMOUNT
6354
6355 void Client::_close_sessions()
6356 {
6357 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6358 if (it->second->state == MetaSession::STATE_REJECTED)
6359 mds_sessions.erase(it++);
6360 else
6361 ++it;
6362 }
6363
6364 while (!mds_sessions.empty()) {
6365 // send session closes!
6366 for (auto &p : mds_sessions) {
6367 if (p.second->state != MetaSession::STATE_CLOSING) {
6368 _close_mds_session(p.second.get());
6369 mds_ranks_closing.insert(p.first);
6370 }
6371 }
6372
6373 // wait for sessions to close
6374 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6375 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6376 << timo << "s)" << dendl;
6377 std::unique_lock l{client_lock, std::adopt_lock};
6378 if (!timo) {
6379 mount_cond.wait(l);
6380 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6381 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6382 while (!mds_ranks_closing.empty()) {
6383 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6384 // this prunes entry from mds_sessions and mds_ranks_closing
6385 _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
6386 }
6387 }
6388
6389 mds_ranks_closing.clear();
6390 l.release();
6391 }
6392 }
6393
6394 void Client::flush_mdlog_sync(Inode *in)
6395 {
6396 if (in->unsafe_ops.empty()) {
6397 return;
6398 }
6399
6400 std::set<mds_rank_t> anchor;
6401 for (auto &&p : in->unsafe_ops) {
6402 anchor.emplace(p->mds);
6403 }
6404 if (in->auth_cap) {
6405 anchor.emplace(in->auth_cap->session->mds_num);
6406 }
6407
6408 for (auto &rank : anchor) {
6409 auto session = &mds_sessions.at(rank);
6410 flush_mdlog(session->get());
6411 }
6412 }
6413
6414 void Client::flush_mdlog_sync()
6415 {
6416 if (mds_requests.empty())
6417 return;
6418 for (auto &p : mds_sessions) {
6419 flush_mdlog(p.second.get());
6420 }
6421 }
6422
6423 void Client::flush_mdlog(MetaSession *session)
6424 {
6425 // Only send this to Luminous or newer MDS daemons; older daemons
6426 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6427 const uint64_t features = session->con->get_features();
6428 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6429 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6430 session->con->send_message2(std::move(m));
6431 }
6432 }
6433
6434
6435 void Client::_abort_mds_sessions(int err)
6436 {
6437 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6438 auto req = p->second;
6439 ++p;
6440 // unsafe requests will be removed during close session below.
6441 if (req->got_unsafe)
6442 continue;
6443
6444 req->abort(err);
6445 if (req->caller_cond) {
6446 req->kick = true;
6447 req->caller_cond->notify_all();
6448 }
6449 }
6450
6451 // Process aborts on any requests that were on this waitlist.
6452 // Any requests that were on a waiting_for_open session waitlist
6453 // will get kicked during close session below.
6454 signal_cond_list(waiting_for_mdsmap);
6455
6456 // Force-close all sessions
6457 while(!mds_sessions.empty()) {
6458 auto session = mds_sessions.begin()->second;
6459 _closed_mds_session(session.get(), err);
6460 }
6461 }
6462
6463 void Client::_unmount(bool abort)
6464 {
6465 /*
6466 * We are unmounting the client.
6467 *
6468 * Set the state to CLIENT_UNMOUNTING to block and fail any newly
6469 * arriving "readers", then wait for all the in-flight "readers"
6470 * to finish.
6471 */
6472 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6473 if (!mref_writer.is_first_writer())
6474 return;
6475 mref_writer.wait_readers_done();
6476
6477 std::unique_lock lock{client_lock};
6478
6479 if (abort || blocklisted) {
6480 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6481 } else {
6482 ldout(cct, 2) << "unmounting" << dendl;
6483 }
6484
6485 deleg_timeout = 0;
6486
6487 if (abort) {
6488 mount_aborted = true;
6489 // Abort all mds sessions
6490 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6491
6492 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6493 } else {
6494 // flush the mdlog for pending requests, if any
6495 flush_mdlog_sync();
6496 }
6497
6498 mount_cond.wait(lock, [this] {
6499 if (!mds_requests.empty()) {
6500 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6501 << dendl;
6502 }
6503 return mds_requests.empty();
6504 });
6505
6506 cwd.reset();
6507 root.reset();
6508
6509 // clean up any unclosed files
6510 while (!fd_map.empty()) {
6511 Fh *fh = fd_map.begin()->second;
6512 fd_map.erase(fd_map.begin());
6513 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6514 _release_fh(fh);
6515 }
6516
6517 while (!ll_unclosed_fh_set.empty()) {
6518 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6519 Fh *fh = *it;
6520 ll_unclosed_fh_set.erase(fh);
6521 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6522 _release_fh(fh);
6523 }
6524
6525 while (!opened_dirs.empty()) {
6526 dir_result_t *dirp = *opened_dirs.begin();
6527 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6528 _closedir(dirp);
6529 }
6530
6531 _ll_drop_pins();
6532
6533 if (cct->_conf->client_oc) {
6534 // flush/release all buffered data
6535 std::list<InodeRef> anchor;
6536 for (auto& p : inode_map) {
6537 Inode *in = p.second;
6538 if (!in) {
6539 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6540 ceph_assert(in);
6541 }
6542
6543 // prevent inode from getting freed
6544 anchor.emplace_back(in);
6545
6546 if (abort || blocklisted) {
6547 objectcacher->purge_set(&in->oset);
6548 } else if (!in->caps.empty()) {
6549 _release(in);
6550 _flush(in, new C_Client_FlushComplete(this, in));
6551 }
6552 }
6553 }
6554
6555 if (abort || blocklisted) {
6556 for (auto &q : mds_sessions) {
6557 auto s = q.second;
6558 for (auto p = s->dirty_list.begin(); !p.end(); ) {
6559 Inode *in = *p;
6560 ++p;
6561 if (in->dirty_caps) {
6562 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6563 in->mark_caps_clean();
6564 put_inode(in);
6565 }
6566 }
6567 }
6568 } else {
6569 flush_caps_sync();
6570 wait_sync_caps(last_flush_tid);
6571 }
6572
6573 // empty lru cache
6574 trim_cache();
6575
6576 delay_put_inodes();
6577
6578 while (lru.lru_get_size() > 0 ||
6579 !inode_map.empty()) {
6580 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6581 << "+" << inode_map.size() << " items"
6582 << ", waiting (for caps to release?)"
6583 << dendl;
6584
6585 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6586 r == std::cv_status::timeout) {
6587 dump_cache(NULL);
6588 }
6589 }
6590 ceph_assert(lru.lru_get_size() == 0);
6591 ceph_assert(inode_map.empty());
6592
6593 // stop tracing
6594 if (!cct->_conf->client_trace.empty()) {
6595 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6596 traceout.close();
6597 }
6598
6599 // stop the tick thread
6600 tick_thread_stopped = true;
6601 upkeep_cond.notify_one();
6602
6603 _close_sessions();
6604
6605 // release the global snapshot realm
6606 SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
6607 if (global_realm) {
6608 ceph_assert(global_realm->nref == 1);
6609 put_snap_realm(global_realm);
6610 }
6611
6612 mref_writer.update_state(CLIENT_UNMOUNTED);
6613
6614 ldout(cct, 2) << "unmounted." << dendl;
6615 }
6616
6617 void Client::unmount()
6618 {
6619 _unmount(false);
6620 }
6621
6622 void Client::abort_conn()
6623 {
6624 _unmount(true);
6625 }
6626
6627 void Client::flush_cap_releases()
6628 {
6629 uint64_t nr_caps = 0;
6630
6631 // send any cap releases
6632 for (auto &p : mds_sessions) {
6633 auto session = p.second;
6634 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6635 p.first)) {
6636 nr_caps += session->release->caps.size();
6637 if (cct->_conf->client_inject_release_failure) {
6638 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6639 } else {
6640 session->con->send_message2(std::move(session->release));
6641 }
6642 session->release.reset();
6643 }
6644 }
6645
6646 if (nr_caps > 0) {
6647 dec_pinned_icaps(nr_caps);
6648 }
6649 }
6650
6651 void Client::renew_and_flush_cap_releases()
6652 {
6653 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6654
6655 if (!mount_aborted && mdsmap->get_epoch()) {
6656 // renew caps?
6657 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6658 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
6659 renew_caps();
6660
6661 flush_cap_releases();
6662 }
6663 }
6664
6665 void Client::tick()
6666 {
6667 ldout(cct, 20) << "tick" << dendl;
6668
6669 auto now = ceph::coarse_mono_clock::now();
6670
6671 /*
6672 * If mount() has not finished yet, time out the oldest request.
6673 */
6674 if (is_mounting() && !mds_requests.empty()) {
6675 MetaRequest *req = mds_requests.begin()->second;
6676
6677 if (req->created + mount_timeout < now) {
6678 req->abort(-CEPHFS_ETIMEDOUT);
6679 if (req->caller_cond) {
6680 req->kick = true;
6681 req->caller_cond->notify_all();
6682 }
6683 signal_cond_list(waiting_for_mdsmap);
6684 for (auto &p : mds_sessions) {
6685 signal_context_list(p.second->waiting_for_open);
6686 }
6687 }
6688 }
6689
6690 renew_and_flush_cap_releases();
6691
6692 // delayed caps
6693 xlist<Inode*>::iterator p = delayed_list.begin();
6694 while (!p.end()) {
6695 Inode *in = *p;
6696 ++p;
6697 if (!mount_aborted && in->hold_caps_until > now)
6698 break;
6699 delayed_list.pop_front();
6700 if (!mount_aborted)
6701 check_caps(in, CHECK_CAPS_NODELAY);
6702 }
6703
6704 if (!mount_aborted)
6705 collect_and_send_metrics();
6706
6707 delay_put_inodes(is_unmounting());
6708 trim_cache(true);
6709
6710 if (blocklisted && (is_mounted() || is_unmounting()) &&
6711 last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
6712 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6713 messenger->client_reset();
6714 fd_gen++; // invalidate open files
6715 blocklisted = false;
6716 _kick_stale_sessions();
6717 last_auto_reconnect = now;
6718 }
6719 }
6720
6721 void Client::start_tick_thread()
6722 {
6723 upkeeper = std::thread([this]() {
6724 using time = ceph::coarse_mono_time;
6725 using sec = std::chrono::seconds;
6726
6727 auto last_tick = time::min();
6728
6729 std::unique_lock cl(client_lock);
6730 while (!tick_thread_stopped) {
6731 auto now = clock::now();
6732 auto since = now - last_tick;
6733
6734 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6735 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6736
6737 auto interval = std::max(t_interval, d_interval);
6738 if (likely(since >= interval * .90)) {
6739 tick();
6740 last_tick = clock::now();
6741 } else {
6742 interval -= since;
6743 }
6744
6745 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6746 if (!tick_thread_stopped)
6747 upkeep_cond.wait_for(cl, interval);
6748 }
6749 });
6750 }
6751
6752 void Client::collect_and_send_metrics() {
6753 ldout(cct, 20) << __func__ << dendl;
6754
6755 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6756
6757 // right now, we only track and send global metrics. it's sufficient
6758 // to send these metrics to MDS rank 0.
6759 collect_and_send_global_metrics();
6760 }
6761
6762 void Client::collect_and_send_global_metrics() {
6763 ldout(cct, 20) << __func__ << dendl;
6764 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6765
6766 if (!have_open_session((mds_rank_t)0)) {
6767 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6768 << dendl;
6769 return;
6770 }
6771 auto session = _get_or_open_mds_session((mds_rank_t)0);
6772 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6773 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6774 return;
6775 }
6776
6777 ClientMetricMessage metric;
6778 std::vector<ClientMetricMessage> message;
6779
6780 // read latency
6781 if (_collect_and_send_global_metrics ||
6782 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
6783 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
6784 logger->tget(l_c_rd_avg),
6785 logger->get(l_c_rd_sqsum),
6786 nr_read_request));
6787 message.push_back(metric);
6788 }
6789
6790 // write latency
6791 if (_collect_and_send_global_metrics ||
6792 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
6793 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
6794 logger->tget(l_c_wr_avg),
6795 logger->get(l_c_wr_sqsum),
6796 nr_write_request));
6797 message.push_back(metric);
6798 }
6799
6800 // metadata latency
6801 if (_collect_and_send_global_metrics ||
6802 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
6803 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
6804 logger->tget(l_c_md_avg),
6805 logger->get(l_c_md_sqsum),
6806 nr_metadata_request));
6807 message.push_back(metric);
6808 }
6809
6810 // cap hit ratio -- nr_caps is unused right now
6811 if (_collect_and_send_global_metrics ||
6812 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
6813 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6814 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6815 message.push_back(metric);
6816 }
6817
6818 // dentry lease hit ratio
6819 if (_collect_and_send_global_metrics ||
6820 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
6821 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6822 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6823 message.push_back(metric);
6824 }
6825
6826 // opened files
6827 if (_collect_and_send_global_metrics ||
6828 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
6829 auto [opened_files, total_inodes] = get_opened_files_rates();
6830 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6831 message.push_back(metric);
6832 }
6833
6834 // pinned i_caps
6835 if (_collect_and_send_global_metrics ||
6836 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
6837 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6838 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6839 message.push_back(metric);
6840 }
6841
6842 // opened inodes
6843 if (_collect_and_send_global_metrics ||
6844 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
6845 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6846 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6847 message.push_back(metric);
6848 }
6849
6850 // read io sizes
6851 if (_collect_and_send_global_metrics ||
6852 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
6853 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6854 total_read_size));
6855 message.push_back(metric);
6856 }
6857
6858 // write io sizes
6859 if (_collect_and_send_global_metrics ||
6860 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
6861 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6862 total_write_size));
6863 message.push_back(metric);
6864 }
6865
6866 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6867 }
6868
6869 void Client::renew_caps()
6870 {
6871 ldout(cct, 10) << "renew_caps()" << dendl;
6872 last_cap_renew = ceph::coarse_mono_clock::now();
6873
6874 for (auto &p : mds_sessions) {
6875 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6876 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6877 renew_caps(p.second.get());
6878 }
6879 }
6880
6881 void Client::renew_caps(MetaSession *session)
6882 {
6883 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6884 session->last_cap_renew_request = ceph_clock_now();
6885 uint64_t seq = ++session->cap_renew_seq;
6886 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6887 }
6888
6889
6890 // ===============================================================
6891 // high level (POSIXy) interface
6892
6893 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6894 InodeRef *target, const UserPerm& perms)
6895 {
6896 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6897 MetaRequest *req = new MetaRequest(op);
6898 filepath path;
6899 dir->make_nosnap_relative_path(path);
6900 path.push_dentry(name);
6901 req->set_filepath(path);
6902 req->set_inode(dir);
6903 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6904 mask |= DEBUG_GETATTR_CAPS;
6905 req->head.args.getattr.mask = mask;
6906
6907 ldout(cct, 10) << __func__ << " on " << path << dendl;
6908
6909 int r = make_request(req, perms, target);
6910 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6911 return r;
6912 }
6913
6914 bool Client::_dentry_valid(const Dentry *dn)
6915 {
6916 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6917
6918 // is dn lease valid?
6919 utime_t now = ceph_clock_now();
6920 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6921 mds_sessions.count(dn->lease_mds)) {
6922 auto s = mds_sessions.at(dn->lease_mds);
6923 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
6924 dlease_hit();
6925 return true;
6926 }
6927
6928 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6929 << " vs lease_gen " << dn->lease_gen << dendl;
6930 }
6931
6932 dlease_miss();
6933 return false;
6934 }
6935
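// Worked example of the lease check above (numbers made up): a dentry with
// lease_mds=0, lease_ttl=now+5s and lease_gen=3 counts as a dlease hit
// while the mds.0 session still has cap_ttl > now and cap_gen == 3; once
// the session goes stale and cap_gen is bumped to 4, the same lease is
// rejected (a dlease miss) even though its own ttl has not expired.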
6936 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6937 const UserPerm& perms, std::string* alternate_name)
6938 {
6939 int r = 0;
6940 Dentry *dn = NULL;
6941 bool did_lookup_request = false;
6942 // can only request shared caps
6943 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6944
6945 if (dname == "..") {
6946 if (dir->dentries.empty()) {
6947 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6948 filepath path(dir->ino);
6949 req->set_filepath(path);
6950
6951 InodeRef tmptarget;
6952 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6953
6954 if (r == 0) {
6955 *target = std::move(tmptarget);
6956 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6957 } else {
6958 *target = dir;
6959 }
6960 }
6961 else
6962 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6963 goto done;
6964 }
6965
6966 if (dname == ".") {
6967 *target = dir;
6968 goto done;
6969 }
6970
6971 if (!dir->is_dir()) {
6972 r = -CEPHFS_ENOTDIR;
6973 goto done;
6974 }
6975
6976 if (dname.length() > NAME_MAX) {
6977 r = -CEPHFS_ENAMETOOLONG;
6978 goto done;
6979 }
6980
6981 if (dname == cct->_conf->client_snapdir &&
6982 dir->snapid == CEPH_NOSNAP) {
6983 *target = open_snapdir(dir);
6984 goto done;
6985 }
6986
6987 relookup:
6988 if (dir->dir &&
6989 dir->dir->dentries.count(dname)) {
6990 dn = dir->dir->dentries[dname];
6991
6992 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6993 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6994
6995 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6996 if (_dentry_valid(dn)) {
6997 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6998 // make trim_caps() behave.
6999 dir->try_touch_cap(dn->lease_mds);
7000 goto hit_dn;
7001 }
7002 // dir shared caps?
7003 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7004 if (dn->cap_shared_gen == dir->shared_gen &&
7005 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
7006 goto hit_dn;
7007 if (!dn->inode && (dir->flags & I_COMPLETE)) {
7008 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
7009 << *dir << " dn '" << dname << "'" << dendl;
7010 return -CEPHFS_ENOENT;
7011 }
7012 }
7013 } else {
7014 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
7015 }
7016 } else {
7017 // can we conclude ENOENT locally?
7018 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
7019 (dir->flags & I_COMPLETE)) {
7020 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
7021 return -CEPHFS_ENOENT;
7022 }
7023 }
7024
7025 if (did_lookup_request) {
7026 r = 0;
7027 goto done;
7028 }
7029 r = _do_lookup(dir, dname, mask, target, perms);
7030 did_lookup_request = true;
7031 if (r == 0) {
7032 /* complete lookup to get dentry for alternate_name */
7033 goto relookup;
7034 } else {
7035 goto done;
7036 }
7037
7038 hit_dn:
7039 if (dn->inode) {
7040 *target = dn->inode;
7041 if (alternate_name)
7042 *alternate_name = dn->alternate_name;
7043 } else {
7044 r = -CEPHFS_ENOENT;
7045 }
7046 touch_dn(dn);
7047 goto done;
7048
7049 done:
7050 if (r < 0)
7051 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
7052 else
7053 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
7054 return r;
7055 }
7056
7057 int Client::get_or_create(Inode *dir, const char* name,
7058 Dentry **pdn, bool expect_null)
7059 {
7060 // lookup
7061 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7062 dir->open_dir();
7063 if (dir->dir->dentries.count(name)) {
7064 Dentry *dn = dir->dir->dentries[name];
7065 if (_dentry_valid(dn)) {
7066 if (expect_null)
7067 return -CEPHFS_EEXIST;
7068 }
7069 *pdn = dn;
7070 } else {
7071 // otherwise link up a new one
7072 *pdn = link(dir->dir, name, NULL, NULL);
7073 }
7074
7075 // success
7076 return 0;
7077 }
7078
7079 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7080 {
7081 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7082 if (!mref_reader.is_state_satisfied())
7083 return -CEPHFS_ENOTCONN;
7084
7085 ldout(cct, 10) << __func__ << ": " << path << dendl;
7086
7087 std::scoped_lock lock(client_lock);
7088
7089 return path_walk(path, wdr, perms, followsym);
7090 }
7091
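// A hedged usage sketch for walk() (the path and helper are illustrative);
// kept out of the build:
#if 0
static int example_walk(Client *client, const UserPerm &perms)
{
  walk_dentry_result wdr;
  int r = client->walk("/some/dir/file", &wdr, perms, true /* followsym */);
  if (r == 0) {
    // wdr.in now holds an InodeRef to the target; wdr.alternate_name
    // carries the dentry's alternate name, if one was set
  }
  return r;
}
#endif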
7092 int Client::path_walk(const filepath& origpath, InodeRef *end,
7093 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
7094 {
7095 walk_dentry_result wdr;
7096 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
7097 *end = std::move(wdr.in);
7098 return rc;
7099 }
7100
7101 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
7102 bool followsym, int mask, InodeRef dirinode)
7103 {
7104 filepath path = origpath;
7105 InodeRef cur;
7106 std::string alternate_name;
7107 if (origpath.absolute())
7108 cur = root;
7109 else if (!dirinode)
7110 cur = cwd;
7111 else {
7112 cur = dirinode;
7113 }
7114 ceph_assert(cur);
7115
7116 ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
7117 ldout(cct, 10) << __func__ << " " << path << dendl;
7118
7119 int symlinks = 0;
7120
7121 unsigned i=0;
7122 while (i < path.depth() && cur) {
7123 int caps = 0;
7124 const string &dname = path[i];
7125 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
7126 ldout(cct, 20) << " (path is " << path << ")" << dendl;
7127 InodeRef next;
7128 if (cct->_conf->client_permissions) {
7129 int r = may_lookup(cur.get(), perms);
7130 if (r < 0)
7131 return r;
7132 caps = CEPH_CAP_AUTH_SHARED;
7133 }
7134
7135 /* Get extra requested caps on the last component */
7136 if (i == (path.depth() - 1))
7137 caps |= mask;
7138 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
7139 if (r < 0)
7140 return r;
7141 // only follow trailing symlink if followsym. always follow
7142 // 'directory' symlinks.
7143 if (next && next->is_symlink()) {
7144 symlinks++;
7145 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
7146 if (symlinks > MAXSYMLINKS) {
7147 return -CEPHFS_ELOOP;
7148 }
7149
7150 if (i < path.depth() - 1) {
7151 // dir symlink
7152 // replace consumed components of path with symlink dir target
7153 filepath resolved(next->symlink.c_str());
7154 resolved.append(path.postfixpath(i + 1));
7155 path = resolved;
7156 i = 0;
7157 if (next->symlink[0] == '/') {
7158 cur = root;
7159 }
7160 continue;
7161 } else if (followsym) {
7162 if (next->symlink[0] == '/') {
7163 path = next->symlink.c_str();
7164 i = 0;
7165 // reset position
7166 cur = root;
7167 } else {
7168 filepath more(next->symlink.c_str());
7169 // we need to remove the symlink component from the path
7170 // before appending the target that the symlink points to. remain
7171 // at the same position in the path.
7172 path.pop_dentry();
7173 path.append(more);
7174 }
7175 continue;
7176 }
7177 }
7178 cur.swap(next);
7179 i++;
7180 }
7181 if (!cur)
7182 return -CEPHFS_ENOENT;
7183 if (result) {
7184 result->in = std::move(cur);
7185 result->alternate_name = std::move(alternate_name);
7186 }
7187 return 0;
7188 }
7189
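// Worked trace of the symlink handling above (paths made up): walking
// "a/ln/b" where "ln" is a symlink to "/d" splices the remainder into
// "/d/b", resets i to 0 and restarts from root because the target is
// absolute. A trailing relative symlink "x -> y" (with followsym) instead
// pops "x" and appends "y", staying at the same depth. After MAXSYMLINKS
// expansions the walk fails with -CEPHFS_ELOOP.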
7190
7191 // namespace ops
7192
7193 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7194 {
7195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7196 if (!mref_reader.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN;
7198
7199 tout(cct) << "link" << std::endl;
7200 tout(cct) << relexisting << std::endl;
7201 tout(cct) << relpath << std::endl;
7202
7203 filepath existing(relexisting);
7204
7205 InodeRef in, dir;
7206
7207 std::scoped_lock lock(client_lock);
7208 int r = path_walk(existing, &in, perm, true);
7209 if (r < 0)
7210 return r;
7211 if (std::string(relpath) == "/") {
7212 r = -CEPHFS_EEXIST;
7213 return r;
7214 }
7215 filepath path(relpath);
7216 string name = path.last_dentry();
7217 path.pop_dentry();
7218
7219 r = path_walk(path, &dir, perm, true);
7220 if (r < 0)
7221 return r;
7222 if (cct->_conf->client_permissions) {
7223 if (S_ISDIR(in->mode)) {
7224 r = -CEPHFS_EPERM;
7225 return r;
7226 }
7227 r = may_hardlink(in.get(), perm);
7228 if (r < 0)
7229 return r;
7230 r = may_create(dir.get(), perm);
7231 if (r < 0)
7232 return r;
7233 }
7234 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7235 return r;
7236 }
7237
7238 int Client::unlink(const char *relpath, const UserPerm& perm)
7239 {
7240 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7241 }
7242
7243 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7244 {
7245 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7246 if (!mref_reader.is_state_satisfied()) {
7247 return -CEPHFS_ENOTCONN;
7248 }
7249
7250 tout(cct) << __func__ << std::endl;
7251 tout(cct) << dirfd << std::endl;
7252 tout(cct) << relpath << std::endl;
7253 tout(cct) << flags << std::endl;
7254
7255 if (std::string(relpath) == "/") {
7256 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7257 }
7258
7259 filepath path(relpath);
7260 string name = path.last_dentry();
7261 path.pop_dentry();
7262 InodeRef dir;
7263
7264 std::scoped_lock lock(client_lock);
7265
7266 InodeRef dirinode;
7267 int r = get_fd_inode(dirfd, &dirinode);
7268 if (r < 0) {
7269 return r;
7270 }
7271
7272 r = path_walk(path, &dir, perm, true, 0, dirinode);
7273 if (r < 0) {
7274 return r;
7275 }
7276 if (cct->_conf->client_permissions) {
7277 r = may_delete(dir.get(), name.c_str(), perm);
7278 if (r < 0) {
7279 return r;
7280 }
7281 }
7282 if (flags & AT_REMOVEDIR) {
7283 r = _rmdir(dir.get(), name.c_str(), perm);
7284 } else {
7285 r = _unlink(dir.get(), name.c_str(), perm);
7286 }
7287 return r;
7288 }
7289
7290 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7291 {
7292 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7293 if (!mref_reader.is_state_satisfied())
7294 return -CEPHFS_ENOTCONN;
7295
7296 tout(cct) << __func__ << std::endl;
7297 tout(cct) << relfrom << std::endl;
7298 tout(cct) << relto << std::endl;
7299
7300 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7301 return -CEPHFS_EBUSY;
7302
7303 filepath from(relfrom);
7304 filepath to(relto);
7305 string fromname = from.last_dentry();
7306 from.pop_dentry();
7307 string toname = to.last_dentry();
7308 to.pop_dentry();
7309
7310 InodeRef fromdir, todir;
7311
7312 std::scoped_lock lock(client_lock);
7313 int r = path_walk(from, &fromdir, perm);
7314 if (r < 0)
7315 goto out;
7316 r = path_walk(to, &todir, perm);
7317 if (r < 0)
7318 goto out;
7319
7320 if (cct->_conf->client_permissions) {
7321 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7322 if (r < 0)
7323 return r;
7324 r = may_delete(todir.get(), toname.c_str(), perm);
7325 if (r < 0 && r != -CEPHFS_ENOENT)
7326 return r;
7327 }
7328 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7329 out:
7330 return r;
7331 }
7332
7333 // dirs
7334
7335 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7336 {
7337 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7338 }
7339
7340 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7341 std::string alternate_name)
7342 {
7343 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7344 if (!mref_reader.is_state_satisfied())
7345 return -CEPHFS_ENOTCONN;
7346
7347 tout(cct) << __func__ << std::endl;
7348 tout(cct) << dirfd << std::endl;
7349 tout(cct) << relpath << std::endl;
7350 tout(cct) << mode << std::endl;
7351 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7352
7353 if (std::string(relpath) == "/") {
7354 return -CEPHFS_EEXIST;
7355 }
7356
7357 filepath path(relpath);
7358 string name = path.last_dentry();
7359 path.pop_dentry();
7360 InodeRef dir;
7361
7362 std::scoped_lock lock(client_lock);
7363
7364 InodeRef dirinode;
7365 int r = get_fd_inode(dirfd, &dirinode);
7366 if (r < 0) {
7367 return r;
7368 }
7369
7370 r = path_walk(path, &dir, perm, true, 0, dirinode);
7371 if (r < 0) {
7372 return r;
7373 }
7374 if (cct->_conf->client_permissions) {
7375 r = may_create(dir.get(), perm);
7376 if (r < 0) {
7377 return r;
7378 }
7379 }
7380 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7381 }
7382
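// mkdirs() behaves like `mkdir -p`: walk the already-existing prefix of the
// path, then issue one _mkdir() per missing component, falling back to
// _lookup() on CEPHFS_EEXIST to tolerate a racing creator.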
7383 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7384 {
7385 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7386 if (!mref_reader.is_state_satisfied())
7387 return -CEPHFS_ENOTCONN;
7388
7389 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7390 tout(cct) << __func__ << std::endl;
7391 tout(cct) << relpath << std::endl;
7392 tout(cct) << mode << std::endl;
7393
7394 //get through existing parts of path
7395 filepath path(relpath);
7396 unsigned int i;
7397 int r = 0, caps = 0;
7398 InodeRef cur, next;
7399
7400 std::scoped_lock lock(client_lock);
7401 cur = cwd;
7402 for (i=0; i<path.depth(); ++i) {
7403 if (cct->_conf->client_permissions) {
7404 r = may_lookup(cur.get(), perms);
7405 if (r < 0)
7406 break;
7407 caps = CEPH_CAP_AUTH_SHARED;
7408 }
7409 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7410 if (r < 0)
7411 break;
7412 cur.swap(next);
7413 }
7414 if (r != -CEPHFS_ENOENT) return r;
7415 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7416 //make new directory at each level
7417 for (; i<path.depth(); ++i) {
7418 if (cct->_conf->client_permissions) {
7419 r = may_create(cur.get(), perms);
7420 if (r < 0)
7421 return r;
7422 }
7423 //make new dir
7424 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7425
7426 //check proper creation/existence
7427 if (-CEPHFS_EEXIST == r && i < path.depth() - 1) {
7428 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7429 }
7430 if (r < 0)
7431 return r;
7432 //move to new dir and continue
7433 cur.swap(next);
7434 ldout(cct, 20) << __func__ << ": successfully created directory "
7435 << filepath(cur->ino).get_path() << dendl;
7436 }
7437 return 0;
7438 }
7439
7440 int Client::rmdir(const char *relpath, const UserPerm& perms)
7441 {
7442 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7443 }
7444
7445 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7446 {
7447 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7448 if (!mref_reader.is_state_satisfied())
7449 return -CEPHFS_ENOTCONN;
7450
7451 tout(cct) << __func__ << std::endl;
7452 tout(cct) << relpath << std::endl;
7453 tout(cct) << mode << std::endl;
7454 tout(cct) << rdev << std::endl;
7455
7456 if (std::string(relpath) == "/")
7457 return -CEPHFS_EEXIST;
7458
7459 filepath path(relpath);
7460 string name = path.last_dentry();
7461 path.pop_dentry();
7462 InodeRef dir;
7463
7464 std::scoped_lock lock(client_lock);
7465 int r = path_walk(path, &dir, perms);
7466 if (r < 0)
7467 return r;
7468 if (cct->_conf->client_permissions) {
7469 int r = may_create(dir.get(), perms);
7470 if (r < 0)
7471 return r;
7472 }
7473 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7474 }
7475
7476 // symlinks
7477
7478 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7479 {
7480 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7481 }
7482
7483 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7484 std::string alternate_name)
7485 {
7486 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7487 if (!mref_reader.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN;
7489 }
7490
7491 tout(cct) << __func__ << std::endl;
7492 tout(cct) << target << std::endl;
7493 tout(cct) << dirfd << std::endl;
7494 tout(cct) << relpath << std::endl;
7495
7496 if (std::string(relpath) == "/") {
7497 return -CEPHFS_EEXIST;
7498 }
7499
7500 filepath path(relpath);
7501 string name = path.last_dentry();
7502 path.pop_dentry();
7503 InodeRef dir;
7504
7505 std::scoped_lock lock(client_lock);
7506
7507 InodeRef dirinode;
7508 int r = get_fd_inode(dirfd, &dirinode);
7509 if (r < 0) {
7510 return r;
7511 }
7512 r = path_walk(path, &dir, perms, true, 0, dirinode);
7513 if (r < 0) {
7514 return r;
7515 }
7516 if (cct->_conf->client_permissions) {
7517 int r = may_create(dir.get(), perms);
7518 if (r < 0) {
7519 return r;
7520 }
7521 }
7522 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7523 }
7524
7525 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7526 {
7527 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7528 }
7529
7530 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7531 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7532 if (!mref_reader.is_state_satisfied()) {
7533 return -CEPHFS_ENOTCONN;
7534 }
7535
7536 tout(cct) << __func__ << std::endl;
7537 tout(cct) << dirfd << std::endl;
7538 tout(cct) << relpath << std::endl;
7539
7540 InodeRef dirinode;
7541 std::scoped_lock lock(client_lock);
7542 int r = get_fd_inode(dirfd, &dirinode);
7543 if (r < 0) {
7544 return r;
7545 }
7546
7547 InodeRef in;
7548 filepath path(relpath);
7549 r = path_walk(path, &in, perms, false, 0, dirinode);
7550 if (r < 0) {
7551 return r;
7552 }
7553
7554 return _readlink(in.get(), buf, size);
7555 }
7556
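// Like readlink(2), _readlink() copies at most `size` bytes of the link
// target into `buf` without a trailing NUL, returning the number of bytes
// copied (silently truncated if the target is longer than `size`).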
7557 int Client::_readlink(Inode *in, char *buf, size_t size)
7558 {
7559 if (!in->is_symlink())
7560 return -CEPHFS_EINVAL;
7561
7562 // copy into buf (at most size bytes)
7563 int r = in->symlink.length();
7564 if (r > (int)size)
7565 r = size;
7566 memcpy(buf, in->symlink.c_str(), r);
7567 return r;
7568 }
7569
7570
7571 // inode stuff
7572
7573 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7574 {
7575 bool yes = in->caps_issued_mask(mask, true);
7576
7577 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7578 if (yes && !force)
7579 return 0;
7580
7581 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7582 filepath path;
7583 in->make_nosnap_relative_path(path);
7584 req->set_filepath(path);
7585 req->set_inode(in);
7586 req->head.args.getattr.mask = mask;
7587
7588 int res = make_request(req, perms);
7589 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7590 return res;
7591 }
7592
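// Fetch a CephFS virtual xattr (e.g. "ceph.dir.pin") from the MDS; `rank`
// may direct the request at a specific MDS rank. Mirroring getxattr(2), a
// zero `size` probes for the value length, and a longer value yields
// -CEPHFS_ERANGE.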
7593 int Client::_getvxattr(
7594 Inode *in,
7595 const UserPerm& perms,
7596 const char *xattr_name,
7597 ssize_t size,
7598 void *value,
7599 mds_rank_t rank)
7600 {
7601 if (!xattr_name || strlen(xattr_name) == 0 || strlen(xattr_name) > 255) {
7602 return -CEPHFS_ENODATA;
7603 }
7604
7605 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
7606 filepath path;
7607 in->make_nosnap_relative_path(path);
7608 req->set_filepath(path);
7609 req->set_inode(in);
7610 req->set_string2(xattr_name);
7611
7612 bufferlist bl;
7613 int res = make_request(req, perms, nullptr, nullptr, rank, &bl);
7614 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7615
7616 if (res < 0) {
7617 return res;
7618 }
7619
7620 std::string buf;
7621 auto p = bl.cbegin();
7622
7623 DECODE_START(1, p);
7624 decode(buf, p);
7625 DECODE_FINISH(p);
7626
7627 ssize_t len = buf.length();
7628
7629 res = len; // per getxattr(2), a zero-size output buffer returns the value length
7630
7631 if (size > 0) {
7632 if (len > size) {
7633 res = -CEPHFS_ERANGE; // insufficient output buffer space
7634 } else {
7635 memcpy(value, buf.c_str(), len);
7636 }
7637 }
7638 return res;
7639 }
7640
7641 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7642 const UserPerm& perms, InodeRef *inp)
7643 {
7644 int issued = in->caps_issued();
7645 union ceph_mds_request_args args;
7646 bool kill_sguid = false;
7647 int inode_drop = 0;
7648
7649 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7650 ccap_string(issued) << dendl;
7651
7652 if (in->snapid != CEPH_NOSNAP) {
7653 return -CEPHFS_EROFS;
7654 }
7655 if ((mask & CEPH_SETATTR_SIZE) &&
7656 (uint64_t)stx->stx_size > in->size &&
7657 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7658 perms)) {
7659 return -CEPHFS_EDQUOT;
7660 }
7661
7662 memset(&args, 0, sizeof(args));
7663
7664 // make the change locally?
7665 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7666 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7667 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7668 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7669 << in->cap_dirtier_gid << ", forcing sync setattr"
7670 << dendl;
7671 /*
7672 * This works because we implicitly flush the caps as part of the
7673 * request, so the cap update check will happen with the writeback
7674 * cap context, and then the setattr check will happen with the
7675 * caller's context.
7676 *
7677 * In reality this pattern is likely pretty rare (different users
7678 * setattr'ing the same file). If that turns out not to be the
7679 * case later, we can build a more complex pipelined cap writeback
7680 * infrastructure...
7681 */
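/*
 * Concrete case: uid 1000 holds Ax and dirtied ctime locally; uid 0 now
 * calls setattr on the same inode. Adding CEPH_SETATTR_CTIME below keeps
 * the mask non-zero, forcing a synchronous MDS request that writes back
 * the uid-1000 dirty caps first and then applies this change under the
 * caller's credentials.
 */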
7682 mask |= CEPH_SETATTR_CTIME;
7683 }
7684
7685 if (!mask) {
7686 // caller just needs us to bump the ctime
7687 in->ctime = ceph_clock_now();
7688 in->cap_dirtier_uid = perms.uid();
7689 in->cap_dirtier_gid = perms.gid();
7690 if (issued & CEPH_CAP_AUTH_EXCL)
7691 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7692 else if (issued & CEPH_CAP_FILE_EXCL)
7693 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7694 else if (issued & CEPH_CAP_XATTR_EXCL)
7695 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7696 else
7697 mask |= CEPH_SETATTR_CTIME;
7698 }
7699
7700 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7701 kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7702
7703 mask &= ~CEPH_SETATTR_KILL_SGUID;
7704 } else if (mask & CEPH_SETATTR_SIZE) {
7705 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7706 mask |= CEPH_SETATTR_KILL_SGUID;
7707 inode_drop |= CEPH_CAP_AUTH_SHARED;
7708 }
7709
7710 if (mask & CEPH_SETATTR_UID) {
7711 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7712
7713 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7714 in->ctime = ceph_clock_now();
7715 in->cap_dirtier_uid = perms.uid();
7716 in->cap_dirtier_gid = perms.gid();
7717 in->uid = stx->stx_uid;
7718 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7719 mask &= ~CEPH_SETATTR_UID;
7720 kill_sguid = true;
7721 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7722 in->uid != stx->stx_uid) {
7723 args.setattr.uid = stx->stx_uid;
7724 inode_drop |= CEPH_CAP_AUTH_SHARED;
7725 } else {
7726 mask &= ~CEPH_SETATTR_UID;
7727 }
7728 }
7729
7730 if (mask & CEPH_SETATTR_GID) {
7731 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7732
7733 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7734 in->ctime = ceph_clock_now();
7735 in->cap_dirtier_uid = perms.uid();
7736 in->cap_dirtier_gid = perms.gid();
7737 in->gid = stx->stx_gid;
7738 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7739 mask &= ~CEPH_SETATTR_GID;
7740 kill_sguid = true;
7741 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7742 in->gid != stx->stx_gid) {
7743 args.setattr.gid = stx->stx_gid;
7744 inode_drop |= CEPH_CAP_AUTH_SHARED;
7745 } else {
7746 mask &= ~CEPH_SETATTR_GID;
7747 }
7748 }
7749
7750 if (mask & CEPH_SETATTR_MODE) {
7751 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7752
7753 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7754 in->ctime = ceph_clock_now();
7755 in->cap_dirtier_uid = perms.uid();
7756 in->cap_dirtier_gid = perms.gid();
7757 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7758 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7759 mask &= ~CEPH_SETATTR_MODE;
7760 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7761 in->mode != stx->stx_mode) {
7762 args.setattr.mode = stx->stx_mode;
7763 inode_drop |= CEPH_CAP_AUTH_SHARED;
7764 } else {
7765 mask &= ~CEPH_SETATTR_MODE;
7766 }
7767 } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
7768 kill_sguid && S_ISREG(in->mode) &&
7769 (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7770 /* Must squash any setuid/setgid bits with an ownership change */
7771 in->mode &= ~(S_ISUID|S_ISGID);
7772 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7773 }
7774
7775 if (mask & CEPH_SETATTR_BTIME) {
7776 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7777
7778 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7779 in->ctime = ceph_clock_now();
7780 in->cap_dirtier_uid = perms.uid();
7781 in->cap_dirtier_gid = perms.gid();
7782 in->btime = utime_t(stx->stx_btime);
7783 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7784 mask &= ~CEPH_SETATTR_BTIME;
7785 } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
7786 in->btime != utime_t(stx->stx_btime)) {
7787 args.setattr.btime = utime_t(stx->stx_btime);
7788 inode_drop |= CEPH_CAP_AUTH_SHARED;
7789 } else {
7790 mask &= ~CEPH_SETATTR_BTIME;
7791 }
7792 }
7793
7794 if (mask & CEPH_SETATTR_SIZE) {
7795 if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
7796 //too big!
7797 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7798 return -CEPHFS_EFBIG;
7799 }
7800
7801 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7802 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
7803 !(mask & CEPH_SETATTR_KILL_SGUID) &&
7804 stx->stx_size >= in->size) {
7805 if (stx->stx_size > in->size) {
7806 in->size = in->reported_size = stx->stx_size;
7807 in->cap_dirtier_uid = perms.uid();
7808 in->cap_dirtier_gid = perms.gid();
7809 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7810 mask &= ~(CEPH_SETATTR_SIZE);
7811 mask |= CEPH_SETATTR_MTIME;
7812 } else {
7813 // ignore it when size doesn't change
7814 mask &= ~(CEPH_SETATTR_SIZE);
7815 }
7816 } else {
7817 args.setattr.size = stx->stx_size;
7818 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7819 CEPH_CAP_FILE_WR;
7820 }
7821 }
7822
7823 if (mask & CEPH_SETATTR_MTIME) {
7824 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7825 in->mtime = utime_t(stx->stx_mtime);
7826 in->ctime = ceph_clock_now();
7827 in->cap_dirtier_uid = perms.uid();
7828 in->cap_dirtier_gid = perms.gid();
7829 in->time_warp_seq++;
7830 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7831 mask &= ~CEPH_SETATTR_MTIME;
7832 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7833 utime_t(stx->stx_mtime) > in->mtime) {
7834 in->mtime = utime_t(stx->stx_mtime);
7835 in->ctime = ceph_clock_now();
7836 in->cap_dirtier_uid = perms.uid();
7837 in->cap_dirtier_gid = perms.gid();
7838 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7839 mask &= ~CEPH_SETATTR_MTIME;
7840 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7841 in->mtime != utime_t(stx->stx_mtime)) {
7842 args.setattr.mtime = utime_t(stx->stx_mtime);
7843 inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7844 CEPH_CAP_FILE_WR;
7845 } else {
7846 mask &= ~CEPH_SETATTR_MTIME;
7847 }
7848 }
7849
7850 if (mask & CEPH_SETATTR_ATIME) {
7851 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7852 in->atime = utime_t(stx->stx_atime);
7853 in->ctime = ceph_clock_now();
7854 in->cap_dirtier_uid = perms.uid();
7855 in->cap_dirtier_gid = perms.gid();
7856 in->time_warp_seq++;
7857 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7858 mask &= ~CEPH_SETATTR_ATIME;
7859 } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
7860 utime_t(stx->stx_atime) > in->atime) {
7861 in->atime = utime_t(stx->stx_atime);
7862 in->ctime = ceph_clock_now();
7863 in->cap_dirtier_uid = perms.uid();
7864 in->cap_dirtier_gid = perms.gid();
7865 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7866 mask &= ~CEPH_SETATTR_ATIME;
7867 } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
7868 in->atime != utime_t(stx->stx_atime)) {
7869 args.setattr.atime = utime_t(stx->stx_atime);
7870 inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7871 CEPH_CAP_FILE_WR;
7872 } else {
7873 mask &= ~CEPH_SETATTR_ATIME;
7874 }
7875 }
7876
7877 if (!mask) {
7878 in->change_attr++;
7879 return 0;
7880 }
7881
7882 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7883
7884 filepath path;
7885
7886 in->make_nosnap_relative_path(path);
7887 req->set_filepath(path);
7888 req->set_inode(in);
7889
7890 req->head.args = args;
7891 req->inode_drop = inode_drop;
7892 req->head.args.setattr.mask = mask;
7893 req->regetattr_mask = mask;
7894
7895 int res = make_request(req, perms, inp);
7896 ldout(cct, 10) << "_setattr result=" << res << dendl;
7897 return res;
7898 }
7899
7900 /* Note that we only care about attrs that setattr cares about */
7901 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7902 {
7903 stx->stx_size = st->st_size;
7904 stx->stx_mode = st->st_mode;
7905 stx->stx_uid = st->st_uid;
7906 stx->stx_gid = st->st_gid;
7907 #ifdef __APPLE__
7908 stx->stx_mtime = st->st_mtimespec;
7909 stx->stx_atime = st->st_atimespec;
7910 #elif defined(_WIN32)
7911 stx->stx_mtime.tv_sec = st->st_mtime;
7912 stx->stx_atime.tv_sec = st->st_atime;
7913 #else
7914 stx->stx_mtime = st->st_mtim;
7915 stx->stx_atime = st->st_atim;
7916 #endif
7917 }
7918
7919 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7920 const UserPerm& perms, InodeRef *inp)
7921 {
7922 int ret = _do_setattr(in, stx, mask, perms, inp);
7923 if (ret < 0)
7924 return ret;
7925 if (mask & CEPH_SETATTR_MODE)
7926 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7927 return ret;
7928 }
7929
7930 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7931 const UserPerm& perms)
7932 {
7933 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7934 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7935 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7936 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7937 if (cct->_conf->client_permissions) {
7938 int r = may_setattr(in.get(), stx, mask, perms);
7939 if (r < 0)
7940 return r;
7941 }
7942 return __setattrx(in.get(), stx, mask, perms);
7943 }
7944
7945 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7946 const UserPerm& perms)
7947 {
7948 struct ceph_statx stx;
7949
7950 stat_to_statx(attr, &stx);
7951 mask &= ~CEPH_SETATTR_BTIME;
7952
7953 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7954 mask &= ~CEPH_SETATTR_UID;
7955 }
7956 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7957 mask &= ~CEPH_SETATTR_GID;
7958 }
7959
7960 return _setattrx(in, &stx, mask, perms);
7961 }
7962
7963 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7964 const UserPerm& perms)
7965 {
7966 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7967 if (!mref_reader.is_state_satisfied())
7968 return -CEPHFS_ENOTCONN;
7969
7970 tout(cct) << __func__ << std::endl;
7971 tout(cct) << relpath << std::endl;
7972 tout(cct) << mask << std::endl;
7973
7974 filepath path(relpath);
7975 InodeRef in;
7976
7977 std::scoped_lock lock(client_lock);
7978 int r = path_walk(path, &in, perms);
7979 if (r < 0)
7980 return r;
7981 return _setattr(in, attr, mask, perms);
7982 }
7983
7984 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7985 const UserPerm& perms, int flags)
7986 {
7987 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7988 if (!mref_reader.is_state_satisfied())
7989 return -CEPHFS_ENOTCONN;
7990
7991 tout(cct) << __func__ << std::endl;
7992 tout(cct) << relpath << std::endl;
7993 tout(cct) << mask << std::endl;
7994
7995 filepath path(relpath);
7996 InodeRef in;
7997
7998 std::scoped_lock lock(client_lock);
7999 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
8000 if (r < 0)
8001 return r;
8002 return _setattrx(in, stx, mask, perms);
8003 }
8004
8005 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
8006 {
8007 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8008 if (!mref_reader.is_state_satisfied())
8009 return -CEPHFS_ENOTCONN;
8010
8011 tout(cct) << __func__ << std::endl;
8012 tout(cct) << fd << std::endl;
8013 tout(cct) << mask << std::endl;
8014
8015 std::scoped_lock lock(client_lock);
8016 Fh *f = get_filehandle(fd);
8017 if (!f)
8018 return -CEPHFS_EBADF;
8019 #if defined(__linux__) && defined(O_PATH)
8020 if (f->flags & O_PATH)
8021 return -CEPHFS_EBADF;
8022 #endif
8023 return _setattr(f->inode, attr, mask, perms);
8024 }
8025
8026 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
8027 {
8028 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8029 if (!mref_reader.is_state_satisfied())
8030 return -CEPHFS_ENOTCONN;
8031
8032 tout(cct) << __func__ << std::endl;
8033 tout(cct) << fd << std::endl;
8034 tout(cct) << mask << std::endl;
8035
8036 std::scoped_lock lock(client_lock);
8037 Fh *f = get_filehandle(fd);
8038 if (!f)
8039 return -CEPHFS_EBADF;
8040 #if defined(__linux__) && defined(O_PATH)
8041 if (f->flags & O_PATH)
8042 return -CEPHFS_EBADF;
8043 #endif
8044 return _setattrx(f->inode, stx, mask, perms);
8045 }
8046
8047 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
8048 frag_info_t *dirstat, int mask)
8049 {
8050 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8051 if (!mref_reader.is_state_satisfied())
8052 return -CEPHFS_ENOTCONN;
8053
8054 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8055 tout(cct) << "stat" << std::endl;
8056 tout(cct) << relpath << std::endl;
8057
8058 filepath path(relpath);
8059 InodeRef in;
8060
8061 std::scoped_lock lock(client_lock);
8062 int r = path_walk(path, &in, perms, true, mask);
8063 if (r < 0)
8064 return r;
8065 r = _getattr(in, mask, perms);
8066 if (r < 0) {
8067 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8068 return r;
8069 }
8070 fill_stat(in, stbuf, dirstat);
8071 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8072 return r;
8073 }
8074
8075 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8076 {
8077 unsigned mask = 0;
8078
8079 /* AT_STATX_FORCE_SYNC always takes priority over AT_STATX_DONT_SYNC. */
8080 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
8081 goto out;
8082
8083 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8084 mask |= CEPH_CAP_PIN;
8085 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8086 mask |= CEPH_CAP_AUTH_SHARED;
8087 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8088 mask |= CEPH_CAP_LINK_SHARED;
8089 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
8090 mask |= CEPH_CAP_FILE_SHARED;
8091 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8092 mask |= CEPH_CAP_XATTR_SHARED;
8093 out:
8094 return mask;
8095 }
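// e.g. want = CEPH_STATX_MTIME with AT_STATX_FORCE_SYNC maps to
// CEPH_CAP_PIN|CEPH_CAP_FILE_SHARED (a fresh Fs view of mtime), while
// AT_STATX_DONT_SYNC maps to 0 and the statx is served from the cache.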
8096
8097 int Client::statx(const char *relpath, struct ceph_statx *stx,
8098 const UserPerm& perms,
8099 unsigned int want, unsigned int flags)
8100 {
8101 return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
8102 }
8103
8104 int Client::lstat(const char *relpath, struct stat *stbuf,
8105 const UserPerm& perms, frag_info_t *dirstat, int mask)
8106 {
8107 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8108 if (!mref_reader.is_state_satisfied())
8109 return -CEPHFS_ENOTCONN;
8110
8111 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
8112 tout(cct) << __func__ << std::endl;
8113 tout(cct) << relpath << std::endl;
8114
8115 filepath path(relpath);
8116 InodeRef in;
8117
8118 std::scoped_lock lock(client_lock);
8119 // don't follow symlinks
8120 int r = path_walk(path, &in, perms, false, mask);
8121 if (r < 0)
8122 return r;
8123 r = _getattr(in, mask, perms);
8124 if (r < 0) {
8125 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
8126 return r;
8127 }
8128 fill_stat(in, stbuf, dirstat);
8129 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
8130 return r;
8131 }
8132
8133 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
8134 {
8135 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
8136 << " mode 0" << oct << in->mode << dec
8137 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
8138 memset(st, 0, sizeof(struct stat));
8139 if (use_faked_inos())
8140 st->st_ino = in->faked_ino;
8141 else
8142 st->st_ino = in->ino;
8143 st->st_dev = in->snapid;
8144 st->st_mode = in->mode;
8145 st->st_rdev = in->rdev;
8146 if (in->is_dir()) {
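// e.g. a directory with 3 subdirectories reports st_nlink =
// 1 (parent dentry) + 1 (its own ".") + 3 (each subdir's "..") = 5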
8147 switch (in->nlink) {
8148 case 0:
8149 st->st_nlink = 0; /* dir is unlinked */
8150 break;
8151 case 1:
8152 st->st_nlink = 1 /* parent dentry */
8153 + 1 /* <dir>/. */
8154 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdir */
8155 break;
8156 default:
8157 ceph_abort();
8158 }
8159 } else {
8160 st->st_nlink = in->nlink;
8161 }
8162 st->st_uid = in->uid;
8163 st->st_gid = in->gid;
8164 if (in->ctime > in->mtime) {
8165 stat_set_ctime_sec(st, in->ctime.sec());
8166 stat_set_ctime_nsec(st, in->ctime.nsec());
8167 } else {
8168 stat_set_ctime_sec(st, in->mtime.sec());
8169 stat_set_ctime_nsec(st, in->mtime.nsec());
8170 }
8171 stat_set_atime_sec(st, in->atime.sec());
8172 stat_set_atime_nsec(st, in->atime.nsec());
8173 stat_set_mtime_sec(st, in->mtime.sec());
8174 stat_set_mtime_nsec(st, in->mtime.nsec());
8175 if (in->is_dir()) {
8176 if (cct->_conf->client_dirsize_rbytes)
8177 st->st_size = in->rstat.rbytes;
8178 else
8179 st->st_size = in->dirstat.size();
8180 // The Windows "stat" structure provides just a subset of the fields that are
8181 // available on Linux.
8182 #ifndef _WIN32
8183 st->st_blocks = 1;
8184 #endif
8185 } else {
8186 st->st_size = in->size;
8187 #ifndef _WIN32
8188 st->st_blocks = (in->size + 511) >> 9;
8189 #endif
8190 }
8191 #ifndef _WIN32
8192 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8193 #endif
8194
8195 if (dirstat)
8196 *dirstat = in->dirstat;
8197 if (rstat)
8198 *rstat = in->rstat;
8199
8200 return in->caps_issued();
8201 }
8202
8203 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
8204 {
8205 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev " << in->snapid
8206 << " mode 0" << oct << in->mode << dec
8207 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
8208 memset(stx, 0, sizeof(struct ceph_statx));
8209
8210 /*
8211 * If mask is 0, the caller set AT_STATX_DONT_SYNC; set all mask bits
8212 * so we report whatever attributes are currently cached.
8213 */
8214 if (!mask)
8215 mask = ~0;
8216
8217 /* These are always considered to be available */
8218 stx->stx_dev = in->snapid;
8219 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
8220
8221 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8222 stx->stx_mode = S_IFMT & in->mode;
8223 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
8224 stx->stx_rdev = in->rdev;
8225 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
8226
8227 if (mask & CEPH_CAP_AUTH_SHARED) {
8228 stx->stx_uid = in->uid;
8229 stx->stx_gid = in->gid;
8230 stx->stx_mode = in->mode;
8231 in->btime.to_timespec(&stx->stx_btime);
8232 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
8233 }
8234
8235 if (mask & CEPH_CAP_LINK_SHARED) {
8236 if (in->is_dir()) {
8237 switch (in->nlink) {
8238 case 0:
8239 stx->stx_nlink = 0; /* dir is unlinked */
8240 break;
8241 case 1:
8242 stx->stx_nlink = 1 /* parent dentry */
8243 + 1 /* <dir>/. */
8244 + in->dirstat.nsubdirs; /* one <subdir>/.. reference per subdir */
8245 break;
8246 default:
8247 ceph_abort();
8248 }
8249 } else {
8250 stx->stx_nlink = in->nlink;
8251 }
8252 stx->stx_mask |= CEPH_STATX_NLINK;
8253 }
8254
8255 if (mask & CEPH_CAP_FILE_SHARED) {
8256
8257 in->atime.to_timespec(&stx->stx_atime);
8258 in->mtime.to_timespec(&stx->stx_mtime);
8259
8260 if (in->is_dir()) {
8261 if (cct->_conf->client_dirsize_rbytes)
8262 stx->stx_size = in->rstat.rbytes;
8263 else
8264 stx->stx_size = in->dirstat.size();
8265 stx->stx_blocks = 1;
8266 } else {
8267 stx->stx_size = in->size;
8268 stx->stx_blocks = (in->size + 511) >> 9;
8269 }
8270 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
8271 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
8272 }
8273
8274 /* Change time and change_attr both require all shared caps to view */
8275 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
8276 stx->stx_version = in->change_attr;
8277 if (in->ctime > in->mtime)
8278 in->ctime.to_timespec(&stx->stx_ctime);
8279 else
8280 in->mtime.to_timespec(&stx->stx_ctime);
8281 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
8282 }
8283
8284 }
8285
8286 void Client::touch_dn(Dentry *dn)
8287 {
8288 lru.lru_touch(dn);
8289 }
8290
8291 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
8292 {
8293 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
8294 }
8295
8296 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8297 {
8298 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8299 if (!mref_reader.is_state_satisfied())
8300 return -CEPHFS_ENOTCONN;
8301
8302 tout(cct) << __func__ << std::endl;
8303 tout(cct) << fd << std::endl;
8304 tout(cct) << mode << std::endl;
8305
8306 std::scoped_lock lock(client_lock);
8307 Fh *f = get_filehandle(fd);
8308 if (!f)
8309 return -CEPHFS_EBADF;
8310 #if defined(__linux__) && defined(O_PATH)
8311 if (f->flags & O_PATH)
8312 return -CEPHFS_EBADF;
8313 #endif
8314 struct stat attr;
8315 attr.st_mode = mode;
8316 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8317 }
8318
8319 int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
8320 const UserPerm& perms) {
8321 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8322 if (!mref_reader.is_state_satisfied()) {
8323 return -CEPHFS_ENOTCONN;
8324 }
8325
8326 tout(cct) << __func__ << std::endl;
8327 tout(cct) << dirfd << std::endl;
8328 tout(cct) << relpath << std::endl;
8329 tout(cct) << mode << std::endl;
8330 tout(cct) << flags << std::endl;
8331
8332 filepath path(relpath);
8333 InodeRef in;
8334 InodeRef dirinode;
8335
8336 std::scoped_lock lock(client_lock);
8337 int r = get_fd_inode(dirfd, &dirinode);
8338 if (r < 0) {
8339 return r;
8340 }
8341
8342 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8343 if (r < 0) {
8344 return r;
8345 }
8346 struct stat attr;
8347 attr.st_mode = mode;
8348 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8349 }
8350
8351 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8352 {
8353 return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
8354 }
8355
8356 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8357 const UserPerm& perms)
8358 {
8359 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
8360 }
8361
8362 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8363 {
8364 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8365 if (!mref_reader.is_state_satisfied())
8366 return -CEPHFS_ENOTCONN;
8367
8368 tout(cct) << __func__ << std::endl;
8369 tout(cct) << fd << std::endl;
8370 tout(cct) << new_uid << std::endl;
8371 tout(cct) << new_gid << std::endl;
8372
8373 std::scoped_lock lock(client_lock);
8374 Fh *f = get_filehandle(fd);
8375 if (!f)
8376 return -CEPHFS_EBADF;
8377 #if defined(__linux__) && defined(O_PATH)
8378 if (f->flags & O_PATH)
8379 return -CEPHFS_EBADF;
8380 #endif
8381 struct stat attr;
8382 attr.st_uid = new_uid;
8383 attr.st_gid = new_gid;
8384 int mask = 0;
8385 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8386 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8387 return _setattr(f->inode, &attr, mask, perms);
8388 }
8389
8390 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8391 const UserPerm& perms)
8392 {
8393 return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
8394 }
8395
8396 int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8397 int flags, const UserPerm& perms) {
8398 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8399 if (!mref_reader.is_state_satisfied()) {
8400 return -CEPHFS_ENOTCONN;
8401 }
8402
8403 tout(cct) << __func__ << std::endl;
8404 tout(cct) << dirfd << std::endl;
8405 tout(cct) << relpath << std::endl;
8406 tout(cct) << new_uid << std::endl;
8407 tout(cct) << new_gid << std::endl;
8408 tout(cct) << flags << std::endl;
8409
8410 filepath path(relpath);
8411 InodeRef in;
8412 InodeRef dirinode;
8413
8414 std::scoped_lock lock(client_lock);
8415 int r = get_fd_inode(dirfd, &dirinode);
8416 if (r < 0) {
8417 return r;
8418 }
8419
8420 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8421 if (r < 0) {
8422 return r;
8423 }
8424 struct stat attr;
8425 attr.st_uid = new_uid;
8426 attr.st_gid = new_gid;
8427 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8428 }
8429
8430 static void attr_set_atime_and_mtime(struct stat *attr,
8431 const utime_t &atime,
8432 const utime_t &mtime)
8433 {
8434 stat_set_atime_sec(attr, atime.tv.tv_sec);
8435 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8436 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8437 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8438 }
8439
8440 // for [l]utime(), invoke the timeval variants, as the timespec
8441 // variants are not yet implemented. for futime[s](), invoke
8442 // the timespec variant.
8443 int Client::utime(const char *relpath, struct utimbuf *buf,
8444 const UserPerm& perms)
8445 {
8446 struct timeval tv[2];
8447 tv[0].tv_sec = buf->actime;
8448 tv[0].tv_usec = 0;
8449 tv[1].tv_sec = buf->modtime;
8450 tv[1].tv_usec = 0;
8451
8452 return utimes(relpath, tv, perms);
8453 }
8454
8455 int Client::lutime(const char *relpath, struct utimbuf *buf,
8456 const UserPerm& perms)
8457 {
8458 struct timeval tv[2];
8459 tv[0].tv_sec = buf->actime;
8460 tv[0].tv_usec = 0;
8461 tv[1].tv_sec = buf->modtime;
8462 tv[1].tv_usec = 0;
8463
8464 return lutimes(relpath, tv, perms);
8465 }
8466
8467 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8468 {
8469 struct timespec ts[2];
8470 ts[0].tv_sec = buf->actime;
8471 ts[0].tv_nsec = 0;
8472 ts[1].tv_sec = buf->modtime;
8473 ts[1].tv_nsec = 0;
8474
8475 return futimens(fd, ts, perms);
8476 }
8477
8478 int Client::utimes(const char *relpath, struct timeval times[2],
8479 const UserPerm& perms)
8480 {
8481 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8482 if (!mref_reader.is_state_satisfied())
8483 return -CEPHFS_ENOTCONN;
8484
8485 tout(cct) << __func__ << std::endl;
8486 tout(cct) << relpath << std::endl;
8487 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8488 << std::endl;
8489 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8490 << std::endl;
8491
8492 filepath path(relpath);
8493 InodeRef in;
8494
8495 std::scoped_lock lock(client_lock);
8496 int r = path_walk(path, &in, perms);
8497 if (r < 0)
8498 return r;
8499 struct stat attr;
8500 utime_t atime(times[0]);
8501 utime_t mtime(times[1]);
8502
8503 attr_set_atime_and_mtime(&attr, atime, mtime);
8504 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8505 }
8506
8507 int Client::lutimes(const char *relpath, struct timeval times[2],
8508 const UserPerm& perms)
8509 {
8510 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8511 if (!mref_reader.is_state_satisfied())
8512 return -CEPHFS_ENOTCONN;
8513
8514 tout(cct) << __func__ << std::endl;
8515 tout(cct) << relpath << std::endl;
8516 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8517 << std::endl;
8518 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8519 << std::endl;
8520
8521 filepath path(relpath);
8522 InodeRef in;
8523
8524 std::scoped_lock lock(client_lock);
8525 int r = path_walk(path, &in, perms, false);
8526 if (r < 0)
8527 return r;
8528 struct stat attr;
8529 utime_t atime(times[0]);
8530 utime_t mtime(times[1]);
8531
8532 attr_set_atime_and_mtime(&attr, atime, mtime);
8533 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8534 }
8535
8536 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8537 {
8538 struct timespec ts[2];
8539 ts[0].tv_sec = times[0].tv_sec;
8540 ts[0].tv_nsec = times[0].tv_usec * 1000;
8541 ts[1].tv_sec = times[1].tv_sec;
8542 ts[1].tv_nsec = times[1].tv_usec * 1000;
8543
8544 return futimens(fd, ts, perms);
8545 }
8546
8547 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8548 {
8549 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8550 if (!mref_reader.is_state_satisfied())
8551 return -CEPHFS_ENOTCONN;
8552
8553 tout(cct) << __func__ << std::endl;
8554 tout(cct) << fd << std::endl;
8555 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8556 << std::endl;
8557 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8558 << std::endl;
8559
8560 std::scoped_lock lock(client_lock);
8561 Fh *f = get_filehandle(fd);
8562 if (!f)
8563 return -CEPHFS_EBADF;
8564 #if defined(__linux__) && defined(O_PATH)
8565 if (f->flags & O_PATH)
8566 return -CEPHFS_EBADF;
8567 #endif
8568 struct stat attr;
8569 utime_t atime(times[0]);
8570 utime_t mtime(times[1]);
8571
8572 attr_set_atime_and_mtime(&attr, atime, mtime);
8573 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8574 }
8575
8576 int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
8577 const UserPerm& perms) {
8578 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8579 if (!mref_reader.is_state_satisfied()) {
8580 return -CEPHFS_ENOTCONN;
8581 }
8582
8583 tout(cct) << __func__ << std::endl;
8584 tout(cct) << dirfd << std::endl;
8585 tout(cct) << relpath << std::endl;
8586 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8587 << std::endl;
8588 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8589 << std::endl;
8590 tout(cct) << flags << std::endl;
8591
8592 filepath path(relpath);
8593 InodeRef in;
8594 InodeRef dirinode;
8595
8596 std::scoped_lock lock(client_lock);
8597 int r = get_fd_inode(dirfd, &dirinode);
8598 if (r < 0) {
8599 return r;
8600 }
8601
8602 #if defined(__linux__) && defined(O_PATH)
8603 if (flags & O_PATH) {
8604 return -CEPHFS_EBADF;
8605 }
8606 #endif
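// NB: `flags` carries AT_* values here, so testing the O_PATH open(2) bit
// against it is suspect; O_PATH file handles are already rejected per-fd
// in futimens() above.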
8607
8608 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8609 if (r < 0) {
8610 return r;
8611 }
8612 struct stat attr;
8613 utime_t atime(times[0]);
8614 utime_t mtime(times[1]);
8615
8616 attr_set_atime_and_mtime(&attr, atime, mtime);
8617 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8618 }
8619
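// Note: unlike flock(2), the caller supplies an `owner` token identifying
// the lock holder, letting a single mount multiplex locks for many logical
// owners (as NFS or SMB gateways do).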
8620 int Client::flock(int fd, int operation, uint64_t owner)
8621 {
8622 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8623 if (!mref_reader.is_state_satisfied())
8624 return -CEPHFS_ENOTCONN;
8625
8626 tout(cct) << __func__ << std::endl;
8627 tout(cct) << fd << std::endl;
8628 tout(cct) << operation << std::endl;
8629 tout(cct) << owner << std::endl;
8630
8631 std::scoped_lock lock(client_lock);
8632 Fh *f = get_filehandle(fd);
8633 if (!f)
8634 return -CEPHFS_EBADF;
8635
8636 return _flock(f, operation, owner);
8637 }
8638
8639 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8640 {
8641 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8642 if (!mref_reader.is_state_satisfied())
8643 return -CEPHFS_ENOTCONN;
8644
8645 tout(cct) << __func__ << std::endl;
8646 tout(cct) << relpath << std::endl;
8647
8648 filepath path(relpath);
8649 InodeRef in;
8650
8651 std::scoped_lock lock(client_lock);
8652 int r = path_walk(path, &in, perms, true);
8653 if (r < 0)
8654 return r;
8655 if (cct->_conf->client_permissions) {
8656 int r = may_open(in.get(), O_RDONLY, perms);
8657 if (r < 0)
8658 return r;
8659 }
8660 r = _opendir(in.get(), dirpp, perms);
8661 /* on ENOTDIR, *dirpp is left as an uninitialized pointer and must not be dereferenced */
8662 if (r != -CEPHFS_ENOTDIR)
8663 tout(cct) << (uintptr_t)*dirpp << std::endl;
8664 return r;
8665 }
8666
8667 int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
8668 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8669 if (!mref_reader.is_state_satisfied()) {
8670 return -CEPHFS_ENOTCONN;
8671 }
8672
8673 tout(cct) << __func__ << std::endl;
8674 tout(cct) << dirfd << std::endl;
8675
8676 InodeRef dirinode;
8677 std::scoped_lock locker(client_lock);
8678 int r = get_fd_inode(dirfd, &dirinode);
8679 if (r < 0) {
8680 return r;
8681 }
8682
8683 if (cct->_conf->client_permissions) {
8684 r = may_open(dirinode.get(), O_RDONLY, perms);
8685 if (r < 0) {
8686 return r;
8687 }
8688 }
8689 r = _opendir(dirinode.get(), dirpp, perms);
8690 /* on ENOTDIR, *dirpp is left as an uninitialized pointer and must not be dereferenced */
8691 if (r != -CEPHFS_ENOTDIR) {
8692 tout(cct) << (uintptr_t)*dirpp << std::endl;
8693 }
8694 return r;
8695 }
8696
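// _opendir() allocates client-side state only (no MDS round trip); the
// dir_result_t is tracked in opened_dirs and must be released with
// _closedir().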
8697 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8698 {
8699 if (!in->is_dir())
8700 return -CEPHFS_ENOTDIR;
8701 *dirpp = new dir_result_t(in, perms);
8702 opened_dirs.insert(*dirpp);
8703 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8704 return 0;
8705 }
8706
8707
8708 int Client::closedir(dir_result_t *dir)
8709 {
8710 tout(cct) << __func__ << std::endl;
8711 tout(cct) << (uintptr_t)dir << std::endl;
8712
8713 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8714 std::scoped_lock lock(client_lock);
8715 _closedir(dir);
8716 return 0;
8717 }
8718
8719 void Client::_closedir(dir_result_t *dirp)
8720 {
8721 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8722
8723 if (dirp->inode) {
8724 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8725 dirp->inode.reset();
8726 }
8727 _readdir_drop_dirp_buffer(dirp);
8728 opened_dirs.erase(dirp);
8729 delete dirp;
8730 }
8731
8732 void Client::rewinddir(dir_result_t *dirp)
8733 {
8734 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8735
8736 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8737 if (!mref_reader.is_state_satisfied())
8738 return;
8739
8740 std::scoped_lock lock(client_lock);
8741 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8742 _readdir_drop_dirp_buffer(d);
8743 d->reset();
8744 }
8745
8746 loff_t Client::telldir(dir_result_t *dirp)
8747 {
8748 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8749 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8750 return d->offset;
8751 }
8752
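/*
 * A dir_result_t offset packs the dirfrag (or hash position) into its high
 * bits and the position within that frag into its low bits (see
 * make_fpos()/fpos_high()/fpos_low()), so a backward seek, or any seek that
 * leaves the buffered frag, must drop the readdir buffer and restart that
 * frag.
 */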
8753 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8754 {
8755 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8756
8757 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8758 if (!mref_reader.is_state_satisfied())
8759 return;
8760
8761 std::scoped_lock lock(client_lock);
8762
8763 if (offset == dirp->offset)
8764 return;
8765
8766 if (offset > dirp->offset)
8767 dirp->release_count = 0; // forward seek: invalidate the release count so the dir is not marked complete
8768 else
8769 dirp->ordered_count = 0; // disable filling readdir cache
8770
8771 if (dirp->hash_order()) {
8772 if (dirp->offset > offset) {
8773 _readdir_drop_dirp_buffer(dirp);
8774 dirp->reset();
8775 }
8776 } else {
8777 if (offset == 0 ||
8778 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8779 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8780 _readdir_drop_dirp_buffer(dirp);
8781 dirp->reset();
8782 }
8783 }
8784
8785 dirp->offset = offset;
8786 }
8787
8788
8789 //struct dirent {
8790 // ino_t d_ino; /* inode number */
8791 // off_t d_off; /* offset to the next dirent */
8792 // unsigned short d_reclen; /* length of this record */
8793 // unsigned char d_type; /* type of file */
8794 // char d_name[256]; /* filename */
8795 //};
8796 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8797 {
8798 strncpy(de->d_name, name, 255);
8799 de->d_name[255] = '\0';
8800 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8801 de->d_ino = ino;
8802 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8803 de->d_off = next_off;
8804 #endif
8805 de->d_reclen = 1;
8806 de->d_type = IFTODT(type);
8807 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8808 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8809 #endif
8810 }
8811
8812 void Client::_readdir_next_frag(dir_result_t *dirp)
8813 {
8814 frag_t fg = dirp->buffer_frag;
8815
8816 if (fg.is_rightmost()) {
8817 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8818 dirp->set_end();
8819 return;
8820 }
8821
8822 // advance
8823 fg = fg.next();
8824 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8825
8826 if (dirp->hash_order()) {
8827 // keep last_name
8828 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8829 if (dirp->offset < new_offset) // don't decrease offset
8830 dirp->offset = new_offset;
8831 } else {
8832 dirp->last_name.clear();
8833 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8834 _readdir_rechoose_frag(dirp);
8835 }
8836 }
8837
8838 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8839 {
8840 ceph_assert(dirp->inode);
8841
8842 if (dirp->hash_order())
8843 return;
8844
8845 frag_t cur = frag_t(dirp->offset_high());
8846 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8847 if (fg != cur) {
8848 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8849 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8850 dirp->last_name.clear();
8851 dirp->next_offset = 2;
8852 }
8853 }
8854
8855 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8856 {
8857 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8858 dirp->buffer.clear();
8859 }
8860
8861 int Client::_readdir_get_frag(dir_result_t *dirp)
8862 {
8863 ceph_assert(dirp);
8864 ceph_assert(dirp->inode);
8865
8866 // get the current frag.
8867 frag_t fg;
8868 if (dirp->hash_order())
8869 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8870 else
8871 fg = frag_t(dirp->offset_high());
8872
8873 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8874 << " offset " << hex << dirp->offset << dec << dendl;
8875
8876 int op = CEPH_MDS_OP_READDIR;
8877 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8878 op = CEPH_MDS_OP_LSSNAP;
8879
8880 InodeRef& diri = dirp->inode;
8881
8882 MetaRequest *req = new MetaRequest(op);
8883 filepath path;
8884 diri->make_nosnap_relative_path(path);
8885 req->set_filepath(path);
8886 req->set_inode(diri.get());
8887 req->head.args.readdir.frag = fg;
8888 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8889 if (dirp->last_name.length()) {
8890 req->path2.set_path(dirp->last_name);
8891 } else if (dirp->hash_order()) {
8892 req->head.args.readdir.offset_hash = dirp->offset_high();
8893 }
8894 req->dirp = dirp;
8895
8896 bufferlist dirbl;
8897 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8898
8899 if (res == -CEPHFS_EAGAIN) {
8900 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8901 _readdir_rechoose_frag(dirp);
8902 return _readdir_get_frag(dirp);
8903 }
8904
8905 if (res == 0) {
8906 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8907 << " size " << dirp->buffer.size() << dendl;
8908 } else {
8909 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8910 dirp->set_end();
8911 }
8912
8913 return res;
8914 }
8915
8916 struct dentry_off_lt {
8917 bool operator()(const Dentry* dn, int64_t off) const {
8918 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8919 }
8920 };
8921
8922 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8923 int caps, bool getref)
8924 {
8925 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8926 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8927 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8928 << dendl;
8929 Dir *dir = dirp->inode->dir;
8930
8931 if (!dir) {
8932 ldout(cct, 10) << " dir is empty" << dendl;
8933 dirp->set_end();
8934 return 0;
8935 }
8936
8937 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8938 dir->readdir_cache.end(),
8939 dirp->offset, dentry_off_lt());
8940
8941 string dn_name;
8942 while (true) {
8943 int mask = caps;
8944 if (!dirp->inode->is_complete_and_ordered())
8945 return -CEPHFS_EAGAIN;
8946 if (pd == dir->readdir_cache.end())
8947 break;
8948 Dentry *dn = *pd;
8949 if (dn->inode == NULL) {
8950 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8951 ++pd;
8952 continue;
8953 }
8954 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8955 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8956 ++pd;
8957 continue;
8958 }
8959
8960 int idx = pd - dir->readdir_cache.begin();
8961 if (dn->inode->is_dir()) {
8962 mask |= CEPH_STAT_RSTAT;
8963 }
8964 int r = _getattr(dn->inode, mask, dirp->perms);
8965 if (r < 0)
8966 return r;
8967
8968 // the contents of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8969 pd = dir->readdir_cache.begin() + idx;
8970 if (pd >= dir->readdir_cache.end() || *pd != dn)
8971 return -CEPHFS_EAGAIN;
8972
8973 struct ceph_statx stx;
8974 struct dirent de;
8975 fill_statx(dn->inode, caps, &stx);
8976
8977 uint64_t next_off = dn->offset + 1;
8978 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8979 ++pd;
8980 if (pd == dir->readdir_cache.end())
8981 next_off = dir_result_t::END;
8982
8983 Inode *in = NULL;
8984 if (getref) {
8985 in = dn->inode.get();
8986 _ll_get(in);
8987 }
8988
8989 dn_name = dn->name; // fill in name while we have lock
8990
8991 client_lock.unlock();
8992 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8993 client_lock.lock();
8994 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8995 << " = " << r << dendl;
8996 if (r < 0) {
8997 return r;
8998 }
8999
9000 dirp->offset = next_off;
9001 if (dirp->at_end())
9002 dirp->next_offset = 2;
9003 else
9004 dirp->next_offset = dirp->offset_low();
9005 dirp->last_name = dn_name; // we successfully returned this one; update!
9006 dirp->release_count = 0; // last_name no longer matches the cache index
9007 if (r > 0)
9008 return r;
9009 }
9010
9011 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
9012 dirp->set_end();
9013 return 0;
9014 }
9015
9016 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
9017 unsigned want, unsigned flags, bool getref)
9018 {
9019 int caps = statx_to_mask(flags, want);
9020
9021 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9022 if (!mref_reader.is_state_satisfied())
9023 return -CEPHFS_ENOTCONN;
9024
9025 std::unique_lock cl(client_lock);
9026
9027 dir_result_t *dirp = static_cast<dir_result_t*>(d);
9028
9029 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
9030 << dec << " at_end=" << dirp->at_end()
9031 << " hash_order=" << dirp->hash_order() << dendl;
9032
9033 struct dirent de;
9034 struct ceph_statx stx;
9035 memset(&de, 0, sizeof(de));
9036 memset(&stx, 0, sizeof(stx));
9037
9038 InodeRef& diri = dirp->inode;
9039
9040 if (dirp->at_end())
9041 return 0;
9042
9043 if (dirp->offset == 0) {
9044 ldout(cct, 15) << " including ." << dendl;
9045 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
9046 uint64_t next_off = 1;
9047
9048 int r;
9049 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
9050 if (r < 0)
9051 return r;
9052
9053 fill_statx(diri, caps, &stx);
9054 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
9055
9056 Inode *inode = NULL;
9057 if (getref) {
9058 inode = diri.get();
9059 _ll_get(inode);
9060 }
9061
9062 cl.unlock();
9063 r = cb(p, &de, &stx, next_off, inode);
9064 cl.lock();
9065 if (r < 0)
9066 return r;
9067
9068 dirp->offset = next_off;
9069 if (r > 0)
9070 return r;
9071 }
9072 if (dirp->offset == 1) {
9073 ldout(cct, 15) << " including .." << dendl;
9074 uint64_t next_off = 2;
9075 InodeRef in;
9076 if (diri->dentries.empty())
9077 in = diri;
9078 else
9079 in = diri->get_first_parent()->dir->parent_inode;
9080
9081 int r;
9082 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
9083 if (r < 0)
9084 return r;
9085
9086 fill_statx(in, caps, &stx);
9087 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
9088
9089 Inode *inode = NULL;
9090 if (getref) {
9091 inode = in.get();
9092 _ll_get(inode);
9093 }
9094
9095 cl.unlock();
9096 r = cb(p, &de, &stx, next_off, inode);
9097 cl.lock();
9098 if (r < 0)
9099 return r;
9100
9101 dirp->offset = next_off;
9102 if (r > 0)
9103 return r;
9104 }
9105
9106 // can we read from our cache?
9107 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
9108 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
9109 << dirp->inode->is_complete_and_ordered()
9110 << " issued " << ccap_string(dirp->inode->caps_issued())
9111 << dendl;
9112 if (dirp->inode->snapid != CEPH_SNAPDIR &&
9113 dirp->inode->is_complete_and_ordered() &&
9114 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
9115 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
9116 if (err != -CEPHFS_EAGAIN)
9117 return err;
9118 }
9119
9120 while (1) {
9121 if (dirp->at_end())
9122 return 0;
9123
9124 bool check_caps = true;
9125 if (!dirp->is_cached()) {
9126 int r = _readdir_get_frag(dirp);
9127 if (r)
9128 return r;
9129 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
9130 // different from the requested one (our dirfragtree was outdated)
9131 check_caps = false;
9132 }
9133 frag_t fg = dirp->buffer_frag;
9134
9135 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
9136 << " offset " << hex << dirp->offset << dendl;
9137
9138 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
9139 dirp->offset, dir_result_t::dentry_off_lt());
9140 it != dirp->buffer.end();
9141 ++it) {
9142 dir_result_t::dentry &entry = *it;
9143
9144 uint64_t next_off = entry.offset + 1;
9145
9146 int r;
9147 if (check_caps) {
9148 int mask = caps;
9149 if (entry.inode->is_dir()) {
9150 mask |= CEPH_STAT_RSTAT;
9151 }
9152 r = _getattr(entry.inode, mask, dirp->perms);
9153 if (r < 0)
9154 return r;
9155 }
9156
9157 fill_statx(entry.inode, caps, &stx);
9158 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
9159
9160 Inode *inode = NULL;
9161 if (getref) {
9162 inode = entry.inode.get();
9163 _ll_get(inode);
9164 }
9165
9166 cl.unlock();
9167 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
9168 cl.lock();
9169
9170 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
9171 << " = " << r << dendl;
9172 if (r < 0)
9173 return r;
9174
9175 dirp->offset = next_off;
9176 if (r > 0)
9177 return r;
9178 }
9179
9180 if (dirp->next_offset > 2) {
9181 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
9182 _readdir_drop_dirp_buffer(dirp);
9183 continue; // more!
9184 }
9185
9186 if (!fg.is_rightmost()) {
9187 // next frag!
9188 _readdir_next_frag(dirp);
9189 continue;
9190 }
9191
9192 if (diri->shared_gen == dirp->start_shared_gen &&
9193 diri->dir_release_count == dirp->release_count) {
9194 if (diri->dir_ordered_count == dirp->ordered_count) {
9195 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
9196 if (diri->dir) {
9197 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
9198 diri->dir->readdir_cache.resize(dirp->cache_index);
9199 }
9200 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
9201 } else {
9202 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
9203 diri->flags |= I_COMPLETE;
9204 }
9205 }
9206
9207 dirp->set_end();
9208 return 0;
9209 }
9210 ceph_abort();
9211 return 0;
9212 }
9213
9214
9215 int Client::readdir_r(dir_result_t *d, struct dirent *de)
9216 {
9217 return readdirplus_r(d, de, 0, 0, 0, NULL);
9218 }
9219
9220 /*
9221 * readdirplus_r
9222 *
9223 * returns
9224 * 1 if we got a dirent
9225 * 0 for end of directory
9226 * <0 on error
9227 */
9228
9229 struct single_readdir {
9230 struct dirent *de;
9231 struct ceph_statx *stx;
9232 Inode *inode;
9233 bool full;
9234 };
9235
9236 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9237 struct ceph_statx *stx, off_t off,
9238 Inode *in)
9239 {
9240 single_readdir *c = static_cast<single_readdir *>(p);
9241
9242 if (c->full)
9243 return -1; // already filled this dirent
9244
9245 *c->de = *de;
9246 if (c->stx)
9247 *c->stx = *stx;
9248 c->inode = in;
9249 c->full = true;
9250 return 1;
9251 }
9252
9253 struct dirent *Client::readdir(dir_result_t *d)
9254 {
9255 int ret;
9256 auto& de = d->de;
9257 single_readdir sr;
9258 sr.de = &de;
9259 sr.stx = NULL;
9260 sr.inode = NULL;
9261 sr.full = false;
9262
9263 // our callback fills the dirent and sets sr.full=true on first
9264 // call, and returns -1 the second time around.
9265 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9266 if (ret < -1) {
9267 errno = -ret; // this sucks.
9268 return (dirent *) NULL;
9269 }
9270 if (sr.full) {
9271 return &de;
9272 }
9273 return (dirent *) NULL;
9274 }
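
// A minimal usage sketch (not part of the original source): it assumes a
// mounted Client* client, a UserPerm perms, and an existing directory; the
// path "/mydir" is illustrative only.
//
//   dir_result_t *dirp;
//   if (client->opendir("/mydir", &dirp, perms) == 0) {
//     errno = 0;
//     while (struct dirent *de = client->readdir(dirp))
//       printf("%s\n", de->d_name);
//     // NULL with errno != 0 signals an error; NULL with errno == 0 is EOF
//     client->closedir(dirp);
//   }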
9275
9276 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9277 struct ceph_statx *stx, unsigned want,
9278 unsigned flags, Inode **out)
9279 {
9280 single_readdir sr;
9281 sr.de = de;
9282 sr.stx = stx;
9283 sr.inode = NULL;
9284 sr.full = false;
9285
9286 // our callback fills the dirent and sets sr.full=true on first
9287 // call, and returns -1 the second time around.
9288 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9289 if (r < -1)
9290 return r;
9291 if (out)
9292 *out = sr.inode;
9293 if (sr.full)
9294 return 1;
9295 return 0;
9296 }
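
// Sketch of the tri-state contract above (client/dirp/perms as in the
// readdir() sketch; CEPH_STATX_INO and the process()/handle_error() helpers
// are illustrative):
//
//   struct dirent de;
//   struct ceph_statx stx;
//   int r;
//   while ((r = client->readdirplus_r(dirp, &de, &stx, CEPH_STATX_INO, 0, nullptr)) == 1)
//     process(&de, &stx);   // got one dirent plus its statx
//   if (r < 0)
//     handle_error(r);      // r == 0 simply means end of directory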
9297
9298
9299 /* getdents */
9300 struct getdents_result {
9301 char *buf;
9302 int buflen;
9303 int pos;
9304 bool fullent;
9305 };
9306
9307 static int _readdir_getdent_cb(void *p, struct dirent *de,
9308 struct ceph_statx *stx, off_t off, Inode *in)
9309 {
9310 struct getdents_result *c = static_cast<getdents_result *>(p);
9311
9312 int dlen;
9313 if (c->fullent)
9314 dlen = sizeof(*de);
9315 else
9316 dlen = strlen(de->d_name) + 1;
9317
9318 if (c->pos + dlen > c->buflen)
9319 return -1; // doesn't fit
9320
9321 if (c->fullent) {
9322 memcpy(c->buf + c->pos, de, sizeof(*de));
9323 } else {
9324 memcpy(c->buf + c->pos, de->d_name, dlen);
9325 }
9326 c->pos += dlen;
9327 return 0;
9328 }
9329
9330 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9331 {
9332 getdents_result gr;
9333 gr.buf = buf;
9334 gr.buflen = buflen;
9335 gr.fullent = fullent;
9336 gr.pos = 0;
9337
9338 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9339
9340 if (r < 0) { // some error
9341 if (r == -1) { // buffer ran out of space
9342 if (gr.pos) { // but we got some entries already!
9343 return gr.pos;
9344 } // or we need a larger buffer
9345 return -CEPHFS_ERANGE;
9346 } else { // actual error, return it
9347 return r;
9348 }
9349 }
9350 return gr.pos;
9351 }
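
// Note on the -1 convention above: -CEPHFS_ERANGE is returned only when the
// very first entry did not fit in `buf`, so a caller should retry with a
// larger buffer in that case; a partially filled buffer is still reported as
// a positive byte count.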
9352
9353
9354 /* getdir */
9355 struct getdir_result {
9356 list<string> *contents;
9357 int num;
9358 };
9359
9360 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9361 {
9362 getdir_result *r = static_cast<getdir_result *>(p);
9363
9364 r->contents->push_back(de->d_name);
9365 r->num++;
9366 return 0;
9367 }
9368
9369 int Client::getdir(const char *relpath, list<string>& contents,
9370 const UserPerm& perms)
9371 {
9372 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9373 tout(cct) << "getdir" << std::endl;
9374 tout(cct) << relpath << std::endl;
9375
9376 dir_result_t *d;
9377 int r = opendir(relpath, &d, perms);
9378 if (r < 0)
9379 return r;
9380
9381 getdir_result gr;
9382 gr.contents = &contents;
9383 gr.num = 0;
9384 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9385
9386 closedir(d);
9387
9388 if (r < 0)
9389 return r;
9390 return gr.num;
9391 }
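
// Usage sketch (assumes a mounted Client* client and UserPerm perms; the path
// is illustrative). Note the returned count includes the "." and ".."
// entries emitted by readdir_r_cb():
//
//   std::list<std::string> names;
//   int n = client->getdir("/mydir", names, perms);
//   if (n >= 0)
//     ceph_assert((size_t)n == names.size());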
9392
9393
9394 /****** file i/o **********/
9395
9396 // common parts for open and openat. call with client_lock locked.
9397 int Client::create_and_open(int dirfd, const char *relpath, int flags,
9398 const UserPerm& perms, mode_t mode, int stripe_unit,
9399 int stripe_count, int object_size, const char *data_pool,
9400 std::string alternate_name) {
9401 ceph_assert(ceph_mutex_is_locked(client_lock));
9402 int cflags = ceph_flags_sys2wire(flags);
9403 tout(cct) << cflags << std::endl;
9404
9405 Fh *fh = NULL;
9406
9407 #if defined(__linux__) && defined(O_PATH)
9408 /* When O_PATH is specified, flags other than O_DIRECTORY and
9409 * O_NOFOLLOW are ignored. Please refer to the do_entry_open()
9410 * function in the kernel (fs/open.c). */
9411 if (flags & O_PATH)
9412 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9413 #endif
9414
9415 filepath path(relpath);
9416 InodeRef in;
9417 bool created = false;
9418 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9419 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9420 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9421
9422 InodeRef dirinode = nullptr;
9423 int r = get_fd_inode(dirfd, &dirinode);
9424 if (r < 0) {
9425 return r;
9426 }
9427
9428 r = path_walk(path, &in, perms, followsym, mask, dirinode);
9429 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9430 return -CEPHFS_EEXIST;
9431
9432 #if defined(__linux__) && defined(O_PATH)
9433 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9434 #else
9435 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9436 #endif
9437 return -CEPHFS_ELOOP;
9438
9439 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9440 filepath dirpath = path;
9441 string dname = dirpath.last_dentry();
9442 dirpath.pop_dentry();
9443 InodeRef dir;
9444 r = path_walk(dirpath, &dir, perms, true,
9445 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9446 if (r < 0) {
9447 goto out;
9448 }
9449 if (cct->_conf->client_permissions) {
9450 r = may_create(dir.get(), perms);
9451 if (r < 0)
9452 goto out;
9453 }
9454 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9455 stripe_count, object_size, data_pool, &created, perms,
9456 std::move(alternate_name));
9457 }
9458 if (r < 0)
9459 goto out;
9460
9461 if (!created) {
9462 // posix says we can only check permissions of existing files
9463 if (cct->_conf->client_permissions) {
9464 r = may_open(in.get(), flags, perms);
9465 if (r < 0)
9466 goto out;
9467 }
9468 }
9469
9470 if (!fh)
9471 r = _open(in.get(), flags, mode, &fh, perms);
9472 if (r >= 0) {
9473 // allocate an integer file descriptor
9474 ceph_assert(fh);
9475 r = get_fd();
9476 ceph_assert(fd_map.count(r) == 0);
9477 fd_map[r] = fh;
9478 }
9479
9480 out:
9481 return r;
9482 }
9483
9484 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9485 mode_t mode, int stripe_unit, int stripe_count,
9486 int object_size, const char *data_pool, std::string alternate_name)
9487 {
9488 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9489 stripe_count, object_size, data_pool, alternate_name);
9490 }
9491
9492 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9493 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9494 const char *data_pool, std::string alternate_name) {
9495 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9496 if (!mref_reader.is_state_satisfied()) {
9497 return -CEPHFS_ENOTCONN;
9498 }
9499
9500 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9501 tout(cct) << dirfd << std::endl;
9502 tout(cct) << relpath << std::endl;
9503 tout(cct) << flags << std::endl;
9504 tout(cct) << mode << std::endl;
9505
9506 std::scoped_lock locker(client_lock);
9507 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9508 object_size, data_pool, alternate_name);
9509
9510 tout(cct) << r << std::endl;
9511 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9512 return r;
9513 }
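
// Open/close sketch (assumes a mounted Client* client and UserPerm perms; the
// zeros, nullptr and empty string below just take the layout defaults, and
// the path is illustrative):
//
//   int fd = client->openat(CEPHFS_AT_FDCWD, "dir/file", O_CREAT | O_WRONLY,
//                           perms, 0644, 0, 0, 0, nullptr, "");
//   if (fd >= 0) {
//     ...               // read()/write() on fd
//     client->close(fd);
//   }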
9514
9515 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9516 const UserPerm& perms)
9517 {
9518 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9519
9520 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9521 if (!mref_reader.is_state_satisfied())
9522 return -CEPHFS_ENOTCONN;
9523
9524 std::scoped_lock lock(client_lock);
9525 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9526 filepath path(ino);
9527 req->set_filepath(path);
9528
9529 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9530 char f[30];
9531 sprintf(f, "%u", h);
9532 filepath path2(dirino);
9533 path2.push_dentry(string(f));
9534 req->set_filepath2(path2);
9535
9536 int r = make_request(req, perms, NULL, NULL,
9537 rand() % mdsmap->get_num_in_mds());
9538 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9539 return r;
9540 }
9541
9542
9543 /**
9544 * Load inode into local cache.
9545 *
9546 * If the inode pointer is non-NULL, also take a reference on
9547 * the resulting Inode object in the same operation, so that the
9548 * caller can safely assume the inode will still be there after return.
9549 */
9550 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9551 {
9552 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9553
9554 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9555 if (!mref_reader.is_state_satisfied())
9556 return -CEPHFS_ENOTCONN;
9557
9558 if (is_reserved_vino(vino))
9559 return -CEPHFS_ESTALE;
9560
9561 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9562 filepath path(vino.ino);
9563 req->set_filepath(path);
9564
9565 /*
9566 * The MDS expects either a "real" snapid here or 0. The special value
9567 * carveouts for the snapid are all at the end of the range so we can
9568 * just look for any snapid below this value.
9569 */
9570 if (vino.snapid < CEPH_NOSNAP)
9571 req->head.args.lookupino.snapid = vino.snapid;
9572
9573 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9574 if (r == 0 && inode != NULL) {
9575 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9576 ceph_assert(p != inode_map.end());
9577 *inode = p->second;
9578 _ll_get(*inode);
9579 }
9580 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9581 return r;
9582 }
9583
9584 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9585 {
9586 vinodeno_t vino(ino, CEPH_NOSNAP);
9587 std::scoped_lock lock(client_lock);
9588 return _lookup_vino(vino, perms, inode);
9589 }
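
// Sketch of the reference contract (the release call shown is assumed to be
// the ll_put()-style counterpart of the _ll_get() taken above; the inode
// number is illustrative):
//
//   Inode *in = nullptr;
//   if (client->lookup_ino(inodeno_t(0x10000000000ull), perms, &in) == 0) {
//     ...                  // use in; lookup_ino took a reference for us
//     client->ll_put(in);  // assumed matching release
//   }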
9590
9591 /**
9592 * Find the parent inode of `ino` and insert it into
9593 * our cache. Conditionally also set `parent` to a referenced
9594 * Inode* if the caller provides a non-NULL value.
9595 */
9596 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9597 {
9598 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9599
9600 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9601 filepath path(ino->ino);
9602 req->set_filepath(path);
9603
9604 InodeRef target;
9605 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9606 // Give caller a reference to the parent ino if they provided a pointer.
9607 if (parent != NULL) {
9608 if (r == 0) {
9609 *parent = target.get();
9610 _ll_get(*parent);
9611 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9612 } else {
9613 *parent = NULL;
9614 }
9615 }
9616 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9617 return r;
9618 }
9619
9620 /**
9621 * Populate the parent dentry for `ino`, provided it is
9622 * a child of `parent`.
9623 */
9624 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9625 {
9626 ceph_assert(parent->is_dir());
9627 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9628
9629 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9630 if (!mref_reader.is_state_satisfied())
9631 return -CEPHFS_ENOTCONN;
9632
9633 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9634 req->set_filepath2(filepath(parent->ino));
9635 req->set_filepath(filepath(ino->ino));
9636 req->set_inode(ino);
9637
9638 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9639 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9640 return r;
9641 }
9642
9643 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9644 {
9645 std::scoped_lock lock(client_lock);
9646 return _lookup_name(ino, parent, perms);
9647 }
9648
9649 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9650 {
9651 ceph_assert(in);
9652 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9653
9654 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9655
9656 if (in->snapid != CEPH_NOSNAP) {
9657 in->snap_cap_refs++;
9658 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9659 << ccap_string(in->caps_issued()) << dendl;
9660 }
9661
9662 const auto& conf = cct->_conf;
9663 f->readahead.set_trigger_requests(1);
9664 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9665 uint64_t max_readahead = Readahead::NO_LIMIT;
9666 if (conf->client_readahead_max_bytes) {
9667 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9668 }
9669 if (conf->client_readahead_max_periods) {
9670 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9671 }
9672 f->readahead.set_max_readahead_size(max_readahead);
9673 vector<uint64_t> alignments;
9674 alignments.push_back(in->layout.get_period());
9675 alignments.push_back(in->layout.stripe_unit);
9676 f->readahead.set_alignments(alignments);
9677
9678 return f;
9679 }
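
// Worked readahead sizing (illustrative, assuming defaults): with a layout of
// one 4 MiB stripe per period and client_readahead_max_periods = 4,
// max_readahead above comes to 16 MiB; a nonzero client_readahead_max_bytes
// would clamp it further, and the alignments keep requests on period and
// stripe-unit boundaries.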
9680
9681 int Client::_release_fh(Fh *f)
9682 {
9683 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9684 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9685 Inode *in = f->inode.get();
9686 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9687
9688 in->unset_deleg(f);
9689
9690 if (in->snapid == CEPH_NOSNAP) {
9691 if (in->put_open_ref(f->mode)) {
9692 _flush(in, new C_Client_FlushComplete(this, in));
9693 check_caps(in, 0);
9694 }
9695 } else {
9696 ceph_assert(in->snap_cap_refs > 0);
9697 in->snap_cap_refs--;
9698 }
9699
9700 _release_filelocks(f);
9701
9702 // Finally, read any async err (i.e. from flushes)
9703 int err = f->take_async_err();
9704 if (err != 0) {
9705 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9706 << cpp_strerror(err) << dendl;
9707 } else {
9708 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9709 }
9710
9711 _put_fh(f);
9712
9713 return err;
9714 }
9715
9716 void Client::_put_fh(Fh *f)
9717 {
9718 int left = f->put();
9719 if (!left) {
9720 delete f;
9721 }
9722 }
9723
9724 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9725 const UserPerm& perms)
9726 {
9727 if (in->snapid != CEPH_NOSNAP &&
9728 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9729 return -CEPHFS_EROFS;
9730 }
9731
9732 // use normalized flags to generate cmode
9733 int cflags = ceph_flags_sys2wire(flags);
9734 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9735 cflags |= CEPH_O_LAZY;
9736
9737 int cmode = ceph_flags_to_mode(cflags);
9738 int want = ceph_caps_for_mode(cmode);
9739 int result = 0;
9740
9741 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9742
9743 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9744 // update wanted?
9745 check_caps(in, CHECK_CAPS_NODELAY);
9746 } else {
9747
9748 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9749 filepath path;
9750 in->make_nosnap_relative_path(path);
9751 req->set_filepath(path);
9752 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9753 req->head.args.open.mode = mode;
9754 req->head.args.open.pool = -1;
9755 if (cct->_conf->client_debug_getattr_caps)
9756 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9757 else
9758 req->head.args.open.mask = 0;
9759 req->head.args.open.old_size = in->size; // for O_TRUNC
9760 req->set_inode(in);
9761 result = make_request(req, perms);
9762
9763 /*
9764 * NFS expects that delegations will be broken on a conflicting open,
9765 * not just when there is actual conflicting access to the file. SMB leases
9766 * and oplocks also have similar semantics.
9767 *
9768 * Ensure that clients that have delegations enabled will wait on minimal
9769 * caps during open, just to ensure that other clients holding delegations
9770 * return theirs first.
9771 */
9772 if (deleg_timeout && result == 0) {
9773 int need = 0, have;
9774
9775 if (cmode & CEPH_FILE_MODE_WR)
9776 need |= CEPH_CAP_FILE_WR;
9777 if (cmode & CEPH_FILE_MODE_RD)
9778 need |= CEPH_CAP_FILE_RD;
9779
9780 Fh fh(in, flags, cmode, fd_gen, perms);
9781 result = get_caps(&fh, need, want, &have, -1);
9782 if (result < 0) {
9783 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9784 " . Denying open: " <<
9785 cpp_strerror(result) << dendl;
9786 } else {
9787 put_cap_ref(in, need);
9788 }
9789 }
9790 }
9791
9792 // success?
9793 if (result >= 0) {
9794 if (fhp)
9795 *fhp = _create_fh(in, flags, cmode, perms);
9796 } else {
9797 in->put_open_ref(cmode);
9798 }
9799
9800 trim_cache();
9801
9802 return result;
9803 }
9804
9805 int Client::_renew_caps(Inode *in)
9806 {
9807 int wanted = in->caps_file_wanted();
9808 if (in->is_any_caps() &&
9809 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9810 check_caps(in, CHECK_CAPS_NODELAY);
9811 return 0;
9812 }
9813
9814 int flags = 0;
9815 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9816 flags = O_RDWR;
9817 else if (wanted & CEPH_CAP_FILE_RD)
9818 flags = O_RDONLY;
9819 else if (wanted & CEPH_CAP_FILE_WR)
9820 flags = O_WRONLY;
9821
9822 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9823 filepath path;
9824 in->make_nosnap_relative_path(path);
9825 req->set_filepath(path);
9826 req->head.args.open.flags = flags;
9827 req->head.args.open.pool = -1;
9828 if (cct->_conf->client_debug_getattr_caps)
9829 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9830 else
9831 req->head.args.open.mask = 0;
9832 req->set_inode(in);
9833
9834 // duplicate in case Cap goes away; not sure if that race is a concern?
9835 const UserPerm *pperm = in->get_best_perms();
9836 UserPerm perms;
9837 if (pperm != NULL)
9838 perms = *pperm;
9839 int ret = make_request(req, perms);
9840 return ret;
9841 }
9842
9843 int Client::_close(int fd)
9844 {
9845 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9846 tout(cct) << "close" << std::endl;
9847 tout(cct) << fd << std::endl;
9848
9849 Fh *fh = get_filehandle(fd);
9850 if (!fh)
9851 return -CEPHFS_EBADF;
9852 int err = _release_fh(fh);
9853 fd_map.erase(fd);
9854 put_fd(fd);
9855 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9856 return err;
9857 }
9858
9859 int Client::close(int fd) {
9860 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9861 if (!mref_reader.is_state_satisfied())
9862 return -CEPHFS_ENOTCONN;
9863
9864 std::scoped_lock lock(client_lock);
9865 return _close(fd);
9866 }
9867
9868 // ------------
9869 // read, write
9870
9871 loff_t Client::lseek(int fd, loff_t offset, int whence)
9872 {
9873 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9874 if (!mref_reader.is_state_satisfied())
9875 return -CEPHFS_ENOTCONN;
9876
9877 tout(cct) << "lseek" << std::endl;
9878 tout(cct) << fd << std::endl;
9879 tout(cct) << offset << std::endl;
9880 tout(cct) << whence << std::endl;
9881
9882 std::scoped_lock lock(client_lock);
9883 Fh *f = get_filehandle(fd);
9884 if (!f)
9885 return -CEPHFS_EBADF;
9886 #if defined(__linux__) && defined(O_PATH)
9887 if (f->flags & O_PATH)
9888 return -CEPHFS_EBADF;
9889 #endif
9890 return _lseek(f, offset, whence);
9891 }
9892
9893 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9894 {
9895 Inode *in = f->inode.get();
9896 bool whence_check = false;
9897 loff_t pos = -1;
9898
9899 switch (whence) {
9900 case SEEK_END:
9901 whence_check = true;
9902 break;
9903
9904 #ifdef SEEK_DATA
9905 case SEEK_DATA:
9906 whence_check = true;
9907 break;
9908 #endif
9909
9910 #ifdef SEEK_HOLE
9911 case SEEK_HOLE:
9912 whence_check = true;
9913 break;
9914 #endif
9915 }
9916
9917 if (whence_check) {
9918 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9919 if (r < 0)
9920 return r;
9921 }
9922
9923 switch (whence) {
9924 case SEEK_SET:
9925 pos = offset;
9926 break;
9927
9928 case SEEK_CUR:
9929 pos = f->pos + offset;
9930 break;
9931
9932 case SEEK_END:
9933 pos = in->size + offset;
9934 break;
9935
9936 #ifdef SEEK_DATA
9937 case SEEK_DATA:
9938 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9939 return -CEPHFS_ENXIO;
9940 pos = offset;
9941 break;
9942 #endif
9943
9944 #ifdef SEEK_HOLE
9945 case SEEK_HOLE:
9946 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9947 return -CEPHFS_ENXIO;
9948 pos = in->size;
9949 break;
9950 #endif
9951
9952 default:
9953 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9954 return -CEPHFS_EINVAL;
9955 }
9956
9957 if (pos < 0) {
9958 return -CEPHFS_EINVAL;
9959 } else {
9960 f->pos = pos;
9961 }
9962
9963 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9964 return f->pos;
9965 }
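
// Note on SEEK_DATA/SEEK_HOLE above: the client never consults the object
// layout, so a file is modeled as a single data extent followed by a hole at
// EOF: SEEK_DATA returns the requested offset (when below the file size) and
// SEEK_HOLE returns the file size.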
9966
9967
9968 void Client::lock_fh_pos(Fh *f)
9969 {
9970 ldout(cct, 10) << __func__ << " " << f << dendl;
9971
9972 if (f->pos_locked || !f->pos_waiters.empty()) {
9973 ceph::condition_variable cond;
9974 f->pos_waiters.push_back(&cond);
9975 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9976 std::unique_lock l{client_lock, std::adopt_lock};
9977 cond.wait(l, [f, me=&cond] {
9978 return !f->pos_locked && f->pos_waiters.front() == me;
9979 });
9980 l.release();
9981 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9982 ceph_assert(f->pos_waiters.front() == &cond);
9983 f->pos_waiters.pop_front();
9984 }
9985
9986 f->pos_locked = true;
9987 }
9988
9989 void Client::unlock_fh_pos(Fh *f)
9990 {
9991 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9992
9993 ldout(cct, 10) << __func__ << " " << f << dendl;
9994 f->pos_locked = false;
9995 if (!f->pos_waiters.empty()) {
9996 // only wake up the oldest waiter
9997 auto cond = f->pos_waiters.front();
9998 cond->notify_one();
9999 }
10000 }
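
// Together these form a FIFO lock on the file position: each waiter in
// lock_fh_pos() blocks on its own condvar until it reaches the queue head,
// and unlock_fh_pos() wakes only the oldest waiter, so pos-relative I/O is
// serialized in arrival order.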
10001
10002 int Client::uninline_data(Inode *in, Context *onfinish)
10003 {
10004 if (!in->inline_data.length()) {
10005 onfinish->complete(0);
10006 return 0;
10007 }
10008
10009 char oid_buf[32];
10010 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
10011 object_t oid = oid_buf;
10012
10013 ObjectOperation create_ops;
10014 create_ops.create(false);
10015
10016 objecter->mutate(oid,
10017 OSDMap::file_to_object_locator(in->layout),
10018 create_ops,
10019 in->snaprealm->get_snap_context(),
10020 ceph::real_clock::now(),
10021 0,
10022 NULL);
10023
10024 bufferlist inline_version_bl;
10025 encode(in->inline_version, inline_version_bl);
10026
10027 ObjectOperation uninline_ops;
10028 uninline_ops.cmpxattr("inline_version",
10029 CEPH_OSD_CMPXATTR_OP_GT,
10030 CEPH_OSD_CMPXATTR_MODE_U64,
10031 inline_version_bl);
10032 bufferlist inline_data = in->inline_data;
10033 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
10034 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
10035
10036 objecter->mutate(oid,
10037 OSDMap::file_to_object_locator(in->layout),
10038 uninline_ops,
10039 in->snaprealm->get_snap_context(),
10040 ceph::real_clock::now(),
10041 0,
10042 onfinish);
10043
10044 return 0;
10045 }
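
// The two mutates above split the migration into an idempotent create of the
// first object followed by a write fenced by a cmpxattr comparison on the
// "inline_version" xattr, so racing uninline attempts from multiple clients
// cannot both apply stale inline data.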
10046
10047 //
10048
10049 // blocking osd interface
10050
10051 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
10052 {
10053 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10054 if (!mref_reader.is_state_satisfied())
10055 return -CEPHFS_ENOTCONN;
10056
10057 tout(cct) << "read" << std::endl;
10058 tout(cct) << fd << std::endl;
10059 tout(cct) << size << std::endl;
10060 tout(cct) << offset << std::endl;
10061
10062 std::unique_lock lock(client_lock);
10063 Fh *f = get_filehandle(fd);
10064 if (!f)
10065 return -CEPHFS_EBADF;
10066 #if defined(__linux__) && defined(O_PATH)
10067 if (f->flags & O_PATH)
10068 return -CEPHFS_EBADF;
10069 #endif
10070 bufferlist bl;
10071 /* We can't return a read count larger than INT_MAX, clamp size to that */
10072 size = std::min(size, (loff_t)INT_MAX);
10073 int r = _read(f, offset, size, &bl);
10074 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
10075 if (r >= 0) {
10076 lock.unlock();
10077 bl.begin().copy(bl.length(), buf);
10078 r = bl.length();
10079 }
10080 return r;
10081 }
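
// pread-style sketch (assumes a mounted Client* client and an fd opened for
// reading):
//
//   char buf[4096];
//   int n = client->read(fd, buf, sizeof(buf), 0);  // explicit offset 0
//   if (n < 0)
//     ...                                           // e.g. -CEPHFS_EBADF
//   // pass offset -1 instead to read at (and advance) the file position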
10082
10083 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10084 {
10085 if (iovcnt < 0)
10086 return -CEPHFS_EINVAL;
10087 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10088 }
10089
10090 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
10091 {
10092 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10093
10094 int want, have = 0;
10095 bool movepos = false;
10096 std::unique_ptr<C_SaferCond> onuninline;
10097 int64_t rc = 0;
10098 const auto& conf = cct->_conf;
10099 Inode *in = f->inode.get();
10100 utime_t lat;
10101 utime_t start = ceph_clock_now();
10102
10103 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
10104 return -CEPHFS_EBADF;
10105 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10106
10107 if (offset < 0) {
10108 lock_fh_pos(f);
10109 offset = f->pos;
10110 movepos = true;
10111 }
10112 loff_t start_pos = offset;
10113
10114 if (in->inline_version == 0) {
10115 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10116 if (r < 0) {
10117 rc = r;
10118 goto done;
10119 }
10120 ceph_assert(in->inline_version > 0);
10121 }
10122
10123 retry:
10124 if (f->mode & CEPH_FILE_MODE_LAZY)
10125 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
10126 else
10127 want = CEPH_CAP_FILE_CACHE;
10128 {
10129 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
10130 if (r < 0) {
10131 rc = r;
10132 goto done;
10133 }
10134 }
10135 if (f->flags & O_DIRECT)
10136 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
10137
10138 if (in->inline_version < CEPH_INLINE_NONE) {
10139 if (!(have & CEPH_CAP_FILE_CACHE)) {
10140 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
10141 uninline_data(in, onuninline.get());
10142 } else {
10143 uint32_t len = in->inline_data.length();
10144 uint64_t endoff = offset + size;
10145 if (endoff > in->size)
10146 endoff = in->size;
10147
10148 if (offset < len) {
10149 if (endoff <= len) {
10150 bl->substr_of(in->inline_data, offset, endoff - offset);
10151 } else {
10152 bl->substr_of(in->inline_data, offset, len - offset);
10153 bl->append_zero(endoff - len);
10154 }
10155 rc = endoff - offset;
10156 } else if ((uint64_t)offset < endoff) {
10157 bl->append_zero(endoff - offset);
10158 rc = endoff - offset;
10159 } else {
10160 rc = 0;
10161 }
10162 goto success;
10163 }
10164 }
10165
10166 if (!conf->client_debug_force_sync_read &&
10167 conf->client_oc &&
10168 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
10169
10170 if (f->flags & O_RSYNC) {
10171 _flush_range(in, offset, size);
10172 }
10173 rc = _read_async(f, offset, size, bl);
10174 if (rc < 0)
10175 goto done;
10176 } else {
10177 if (f->flags & O_DIRECT)
10178 _flush_range(in, offset, size);
10179
10180 bool checkeof = false;
10181 rc = _read_sync(f, offset, size, bl, &checkeof);
10182 if (rc < 0)
10183 goto done;
10184 if (checkeof) {
10185 offset += rc;
10186 size -= rc;
10187
10188 put_cap_ref(in, CEPH_CAP_FILE_RD);
10189 have = 0;
10190 // reverify size
10191 {
10192 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10193 if (r < 0) {
10194 rc = r;
10195 goto done;
10196 }
10197 }
10198
10199 // eof? short read.
10200 if ((uint64_t)offset < in->size)
10201 goto retry;
10202 }
10203 }
10204
10205 success:
10206 ceph_assert(rc >= 0);
10207 update_read_io_size(bl->length());
10208 if (movepos) {
10209 // adjust fd pos
10210 f->pos = start_pos + rc;
10211 }
10212
10213 lat = ceph_clock_now();
10214 lat -= start;
10215
10216 ++nr_read_request;
10217 update_io_stat_read(lat);
10218
10219 done:
10220 // done!
10221
10222 if (onuninline) {
10223 client_lock.unlock();
10224 int ret = onuninline->wait();
10225 client_lock.lock();
10226 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
10227 in->inline_data.clear();
10228 in->inline_version = CEPH_INLINE_NONE;
10229 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10230 check_caps(in, 0);
10231 } else
10232 rc = ret;
10233 }
10234 if (have) {
10235 put_cap_ref(in, CEPH_CAP_FILE_RD);
10236 }
10237 if (movepos) {
10238 unlock_fh_pos(f);
10239 }
10240 return rc;
10241 }
10242
10243 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
10244 client(c), f(f) {
10245 f->get();
10246 f->readahead.inc_pending();
10247 }
10248
10249 Client::C_Readahead::~C_Readahead() {
10250 f->readahead.dec_pending();
10251 client->_put_fh(f);
10252 }
10253
10254 void Client::C_Readahead::finish(int r) {
10255 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
10256 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10257 if (r > 0) {
10258 client->update_read_io_size(r);
10259 }
10260 }
10261
10262 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
10263 {
10264 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10265
10266 const auto& conf = cct->_conf;
10267 Inode *in = f->inode.get();
10268
10269 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10270
10271 // trim read based on file size?
10272 if (off >= in->size)
10273 return 0;
10274 if (len == 0)
10275 return 0;
10276 if (off + len > in->size) {
10277 len = in->size - off;
10278 }
10279
10280 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
10281 << " max_bytes=" << f->readahead.get_max_readahead_size()
10282 << " max_periods=" << conf->client_readahead_max_periods << dendl;
10283
10284 // read (and possibly block)
10285 int r = 0;
10286 C_SaferCond onfinish("Client::_read_async flock");
10287 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10288 off, len, bl, 0, &onfinish);
10289 if (r == 0) {
10290 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
10291 client_lock.unlock();
10292 r = onfinish.wait();
10293 client_lock.lock();
10294 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
10295 update_read_io_size(bl->length());
10296 }
10297
10298 if (f->readahead.get_min_readahead_size() > 0) {
10299 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
10300 if (readahead_extent.second > 0) {
10301 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
10302 << " (caller wants " << off << "~" << len << ")" << dendl;
10303 Context *onfinish2 = new C_Readahead(this, f);
10304 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
10305 readahead_extent.first, readahead_extent.second,
10306 NULL, 0, onfinish2);
10307 if (r2 == 0) {
10308 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
10309 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
10310 } else {
10311 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
10312 delete onfinish2;
10313 }
10314 }
10315 }
10316
10317 return r;
10318 }
10319
10320 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
10321 bool *checkeof)
10322 {
10323 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10324
10325 Inode *in = f->inode.get();
10326 uint64_t pos = off;
10327 int left = len;
10328 int read = 0;
10329
10330 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
10331
10332 // Returns 0 on success (done), 1 to continue the loop, and < 0 on error.
10333 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
10334 int r = onfinish.wait();
10335
10336 // if we get ENOENT from OSD, assume 0 bytes returned
10337 if (r == -CEPHFS_ENOENT)
10338 r = 0;
10339 if (r < 0)
10340 return r;
10341
10342 if (tbl.length()) {
10343 r = tbl.length();
10344
10345 read += r;
10346 pos += r;
10347 left -= r;
10348 bl->claim_append(tbl);
10349 }
10350 // short read?
10351 if (r >= 0 && r < wanted) {
10352 if (pos < in->size) {
10353 // zero up to known EOF
10354 int64_t some = in->size - pos;
10355 if (some > left)
10356 some = left;
10357 auto z = buffer::ptr_node::create(some);
10358 z->zero();
10359 bl->push_back(std::move(z));
10360 read += some;
10361 pos += some;
10362 left -= some;
10363 if (left == 0)
10364 return 0;
10365 }
10366
10367 *checkeof = true;
10368 return 0;
10369 }
10370 return 1;
10371 };
10372
10373 while (left > 0) {
10374 C_SaferCond onfinish("Client::_read_sync flock");
10375 bufferlist tbl;
10376
10377 int wanted = left;
10378 filer->read_trunc(in->ino, &in->layout, in->snapid,
10379 pos, left, &tbl, 0,
10380 in->truncate_size, in->truncate_seq,
10381 &onfinish);
10382 client_lock.unlock();
10383 int r = wait_and_copy(onfinish, tbl, wanted);
10384 client_lock.lock();
10385 if (!r)
10386 return read;
10387 if (r < 0)
10388 return r;
10389 }
10390 return read;
10391 }
10392
10393 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10394 {
10395 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10396 if (!mref_reader.is_state_satisfied())
10397 return -CEPHFS_ENOTCONN;
10398
10399 tout(cct) << "write" << std::endl;
10400 tout(cct) << fd << std::endl;
10401 tout(cct) << size << std::endl;
10402 tout(cct) << offset << std::endl;
10403
10404 std::scoped_lock lock(client_lock);
10405 Fh *fh = get_filehandle(fd);
10406 if (!fh)
10407 return -CEPHFS_EBADF;
10408 #if defined(__linux__) && defined(O_PATH)
10409 if (fh->flags & O_PATH)
10410 return -CEPHFS_EBADF;
10411 #endif
10412 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10413 size = std::min(size, (loff_t)INT_MAX);
10414 int r = _write(fh, offset, size, buf, NULL, false);
10415 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10416 return r;
10417 }
10418
10419 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10420 {
10421 if (iovcnt < 0)
10422 return -CEPHFS_EINVAL;
10423 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10424 }
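
// Vectored I/O sketch (assumes a mounted Client* client and an fd opened
// read/write; buffer names and sizes are illustrative):
//
//   char hdr[16], payload[4080];
//   struct iovec iov[2] = { { hdr, sizeof(hdr) },
//                           { payload, sizeof(payload) } };
//   int w = client->pwritev(fd, iov, 2, 0);  // gathers up to 4096 bytes
//   int r = client->preadv(fd, iov, 2, 0);   // scatters them back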
10425
10426 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10427 unsigned iovcnt, int64_t offset,
10428 bool write, bool clamp_to_int)
10429 {
10430 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10431
10432 #if defined(__linux__) && defined(O_PATH)
10433 if (fh->flags & O_PATH)
10434 return -CEPHFS_EBADF;
10435 #endif
10436 loff_t totallen = 0;
10437 for (unsigned i = 0; i < iovcnt; i++) {
10438 totallen += iov[i].iov_len;
10439 }
10440
10441 /*
10442 * Some of the API functions take 64-bit size values, but only return
10443 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10444 * we don't do I/Os larger than the values we can return.
10445 */
10446 if (clamp_to_int) {
10447 totallen = std::min(totallen, (loff_t)INT_MAX);
10448 }
10449 if (write) {
10450 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10451 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10452 return w;
10453 } else {
10454 bufferlist bl;
10455 int64_t r = _read(fh, offset, totallen, &bl);
10456 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10457 if (r <= 0)
10458 return r;
10459
10460 client_lock.unlock();
10461 auto iter = bl.cbegin();
10462 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10463 /*
10464 * This piece of code aims to handle the case that bufferlist
10465 * does not have enough data to fill in the iov
10466 */
10467 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10468 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10469 resid -= round_size;
10470 /* iter is self-updating */
10471 }
10472 client_lock.lock();
10473 return r;
10474 }
10475 }
10476
10477 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10478 {
10479 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10480 if (!mref_reader.is_state_satisfied())
10481 return -CEPHFS_ENOTCONN;
10482
10483 tout(cct) << fd << std::endl;
10484 tout(cct) << offset << std::endl;
10485
10486 std::scoped_lock cl(client_lock);
10487 Fh *fh = get_filehandle(fd);
10488 if (!fh)
10489 return -CEPHFS_EBADF;
10490 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10491 }
10492
10493 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10494 const struct iovec *iov, int iovcnt)
10495 {
10496 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10497
10498 uint64_t fpos = 0;
10499 Inode *in = f->inode.get();
10500
10501 if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
10502 (uint64_t)(offset+size) > in->size ) { //exceeds filesize
10503 return -CEPHFS_EFBIG;
10504 }
10505 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10506
10507 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10508 return -CEPHFS_ENOSPC;
10509 }
10510
10511 ceph_assert(in->snapid == CEPH_NOSNAP);
10512
10513 // was Fh opened as writeable?
10514 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10515 return -CEPHFS_EBADF;
10516
10517 // use/adjust fd pos?
10518 if (offset < 0) {
10519 lock_fh_pos(f);
10520 /*
10521 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10522 * change out from under us.
10523 */
10524 if (f->flags & O_APPEND) {
10525 auto r = _lseek(f, 0, SEEK_END);
10526 if (r < 0) {
10527 unlock_fh_pos(f);
10528 return r;
10529 }
10530 }
10531 offset = f->pos;
10532 fpos = offset+size;
10533 unlock_fh_pos(f);
10534 }
10535
10536 // check quota
10537 uint64_t endoff = offset + size;
10538 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10539 f->actor_perms)) {
10540 return -CEPHFS_EDQUOT;
10541 }
10542
10543 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10544
10545 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10546
10547 // time it.
10548 utime_t start = ceph_clock_now();
10549
10550 if (in->inline_version == 0) {
10551 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10552 if (r < 0)
10553 return r;
10554 ceph_assert(in->inline_version > 0);
10555 }
10556
10557 // copy into a fresh buffer (since our write may be resubmitted asynchronously)
10558 bufferlist bl;
10559 if (buf) {
10560 if (size > 0)
10561 bl.append(buf, size);
10562 } else if (iov) {
10563 for (int i = 0; i < iovcnt; i++) {
10564 if (iov[i].iov_len > 0) {
10565 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10566 }
10567 }
10568 }
10569
10570 utime_t lat;
10571 uint64_t totalwritten;
10572 int want, have;
10573 if (f->mode & CEPH_FILE_MODE_LAZY)
10574 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10575 else
10576 want = CEPH_CAP_FILE_BUFFER;
10577 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10578 if (r < 0)
10579 return r;
10580
10581 /* clear the setuid/setgid bits, if any */
10582 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10583 struct ceph_statx stx = { 0 };
10584
10585 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10586 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10587 if (r < 0)
10588 return r;
10589 } else {
10590 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10591 }
10592
10593 if (f->flags & O_DIRECT)
10594 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10595
10596 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10597
10598 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10599
10600 if (in->inline_version < CEPH_INLINE_NONE) {
10601 if (endoff > cct->_conf->client_max_inline_size ||
10602 endoff > CEPH_INLINE_MAX_SIZE ||
10603 !(have & CEPH_CAP_FILE_BUFFER)) {
10604 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10605 uninline_data(in, onuninline.get());
10606 } else {
10607 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10608
10609 uint32_t len = in->inline_data.length();
10610
10611 if (endoff < len)
10612 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10613
10614 if (offset < len)
10615 in->inline_data.splice(offset, len - offset);
10616 else if (offset > len)
10617 in->inline_data.append_zero(offset - len);
10618
10619 in->inline_data.append(bl);
10620 in->inline_version++;
10621
10622 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10623
10624 goto success;
10625 }
10626 }
10627
10628 if (cct->_conf->client_oc &&
10629 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10630 // do buffered write
10631 if (!in->oset.dirty_or_tx)
10632 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10633
10634 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10635
10636 // async, caching, non-blocking.
10637 r = objectcacher->file_write(&in->oset, &in->layout,
10638 in->snaprealm->get_snap_context(),
10639 offset, size, bl, ceph::real_clock::now(),
10640 0);
10641 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10642
10643 if (r < 0)
10644 goto done;
10645
10646 // flush cached write if O_SYNC is set on file fh
10647 // O_DSYNC == O_SYNC on linux < 2.6.33
10648 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10649 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10650 _flush_range(in, offset, size);
10651 }
10652 } else {
10653 if (f->flags & O_DIRECT)
10654 _flush_range(in, offset, size);
10655
10656 // simple, non-atomic sync write
10657 C_SaferCond onfinish("Client::_write flock");
10658 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10659
10660 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10661 offset, size, bl, ceph::real_clock::now(), 0,
10662 in->truncate_size, in->truncate_seq,
10663 &onfinish);
10664 client_lock.unlock();
10665 r = onfinish.wait();
10666 client_lock.lock();
10667 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10668 if (r < 0)
10669 goto done;
10670 }
10671
10672 // if we get here, write was successful, update client metadata
10673 success:
10674 update_write_io_size(size);
10675 // time
10676 lat = ceph_clock_now();
10677 lat -= start;
10678
10679 ++nr_write_request;
10680 update_io_stat_write(lat);
10681
10682 if (fpos) {
10683 lock_fh_pos(f);
10684 f->pos = fpos;
10685 unlock_fh_pos(f);
10686 }
10687 totalwritten = size;
10688 r = (int64_t)totalwritten;
10689
10690 // extend file?
10691 if (totalwritten + offset > in->size) {
10692 in->size = totalwritten + offset;
10693 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10694
10695 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10696 check_caps(in, CHECK_CAPS_NODELAY);
10697 } else if (is_max_size_approaching(in)) {
10698 check_caps(in, 0);
10699 }
10700
10701 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10702 } else {
10703 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10704 }
10705
10706 // mtime
10707 in->mtime = in->ctime = ceph_clock_now();
10708 in->change_attr++;
10709 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10710
10711 done:
10712
10713 if (nullptr != onuninline) {
10714 client_lock.unlock();
10715 int uninline_ret = onuninline->wait();
10716 client_lock.lock();
10717
10718 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10719 in->inline_data.clear();
10720 in->inline_version = CEPH_INLINE_NONE;
10721 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10722 check_caps(in, 0);
10723 } else
10724 r = uninline_ret;
10725 }
10726
10727 put_cap_ref(in, CEPH_CAP_FILE_WR);
10728 return r;
10729 }
10730
10731 int Client::_flush(Fh *f)
10732 {
10733 Inode *in = f->inode.get();
10734 int err = f->take_async_err();
10735 if (err != 0) {
10736 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10737 << cpp_strerror(err) << dendl;
10738 } else {
10739 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10740 }
10741
10742 return err;
10743 }
10744
10745 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10746 {
10747 struct ceph_statx stx;
10748 stx.stx_size = length;
10749 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10750 }
10751
10752 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10753 {
10754 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10755 if (!mref_reader.is_state_satisfied())
10756 return -CEPHFS_ENOTCONN;
10757
10758 tout(cct) << __func__ << std::endl;
10759 tout(cct) << fd << std::endl;
10760 tout(cct) << length << std::endl;
10761
10762 std::scoped_lock lock(client_lock);
10763 Fh *f = get_filehandle(fd);
10764 if (!f)
10765 return -CEPHFS_EBADF;
10766 #if defined(__linux__) && defined(O_PATH)
10767 if (f->flags & O_PATH)
10768 return -CEPHFS_EBADF;
10769 #endif
10770 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10771 return -CEPHFS_EBADF;
10772 struct stat attr;
10773 attr.st_size = length;
10774 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10775 }
10776
10777 int Client::fsync(int fd, bool syncdataonly)
10778 {
10779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10780 if (!mref_reader.is_state_satisfied())
10781 return -CEPHFS_ENOTCONN;
10782
10783 tout(cct) << "fsync" << std::endl;
10784 tout(cct) << fd << std::endl;
10785 tout(cct) << syncdataonly << std::endl;
10786
10787 std::scoped_lock lock(client_lock);
10788 Fh *f = get_filehandle(fd);
10789 if (!f)
10790 return -CEPHFS_EBADF;
10791 #if defined(__linux__) && defined(O_PATH)
10792 if (f->flags & O_PATH)
10793 return -CEPHFS_EBADF;
10794 #endif
10795 int r = _fsync(f, syncdataonly);
10796 if (r == 0) {
10797 // The IOs in this fsync were okay, but maybe something happened
10798 // in the background that we should be reporting?
10799 r = f->take_async_err();
10800 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10801 << ") = 0, async_err = " << r << dendl;
10802 } else {
10803 // Assume that an error we encountered during fsync, even reported
10804 // synchronously, would also have applied the error to the Fh, and we
10805 // should clear it here to avoid returning the same error again on next
10806 // call.
10807 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10808 << r << dendl;
10809 f->take_async_err();
10810 }
10811 return r;
10812 }
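
// Durability sketch (assumes a mounted Client* client and a writable fd): the
// fsync return value folds in any asynchronous writeback error recorded on
// the Fh, so it should be checked even when the preceding write succeeded.
//
//   if (client->write(fd, buf, len, -1) >= 0) {
//     int r = client->fsync(fd, true /* syncdataonly */);
//     if (r < 0)
//       ...  // e.g. an error surfaced from background flushes
//   }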
10813
10814 int Client::_fsync(Inode *in, bool syncdataonly)
10815 {
10816 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10817
10818 int r = 0;
10819 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10820 ceph_tid_t flush_tid = 0;
10821 InodeRef tmp_ref;
10822 utime_t lat;
10823 utime_t start = ceph_clock_now();
10824
10825 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10826
10827 if (cct->_conf->client_oc) {
10828 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10829 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10830 _flush(in, object_cacher_completion.get());
10831 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10832 }
10833
10834 if (!syncdataonly && in->dirty_caps) {
10835 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10836 if (in->flushing_caps)
10837 flush_tid = last_flush_tid;
10838 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10839
10840 if (!syncdataonly && !in->unsafe_ops.empty()) {
10841 flush_mdlog_sync(in);
10842
10843 MetaRequest *req = in->unsafe_ops.back();
10844 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10845
10846 req->get();
10847 wait_on_list(req->waitfor_safe);
10848 put_request(req);
10849 }
10850
10851 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10852 client_lock.unlock();
10853 ldout(cct, 15) << "waiting on data to flush" << dendl;
10854 r = object_cacher_completion->wait();
10855 client_lock.lock();
10856 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10857 } else {
10858 // FIXME: this can starve
10859 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10860 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10861 << " uncommitted, waiting" << dendl;
10862 wait_on_list(in->waitfor_commit);
10863 }
10864 }
10865
10866 if (!r) {
10867 if (flush_tid > 0)
10868 wait_sync_caps(in, flush_tid);
10869
10870 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10871 } else {
10872 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10873 << cpp_strerror(-r) << dendl;
10874 }
10875
10876 lat = ceph_clock_now();
10877 lat -= start;
10878 logger->tinc(l_c_fsync, lat);
10879
10880 return r;
10881 }
10882
10883 int Client::_fsync(Fh *f, bool syncdataonly)
10884 {
10885 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10886 return _fsync(f->inode.get(), syncdataonly);
10887 }
10888
10889 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10890 {
10891 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10892 if (!mref_reader.is_state_satisfied())
10893 return -CEPHFS_ENOTCONN;
10894
10895 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10896 tout(cct) << fd << std::endl;
10897
10898 std::scoped_lock lock(client_lock);
10899 Fh *f = get_filehandle(fd);
10900 if (!f)
10901 return -CEPHFS_EBADF;
10902 int r = _getattr(f->inode, mask, perms);
10903 if (r < 0)
10904 return r;
10905 fill_stat(f->inode, stbuf, NULL);
10906 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10907 return r;
10908 }
10909
10910 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10911 unsigned int want, unsigned int flags)
10912 {
10913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10914 if (!mref_reader.is_state_satisfied())
10915 return -CEPHFS_ENOTCONN;
10916
10917 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10918 tout(cct) << fd << std::endl;
10919
10920 std::scoped_lock lock(client_lock);
10921 Fh *f = get_filehandle(fd);
10922 if (!f)
10923 return -CEPHFS_EBADF;
10924
10925 unsigned mask = statx_to_mask(flags, want);
10926
10927 int r = 0;
10928 if (mask) {
10929 r = _getattr(f->inode, mask, perms);
10930 if (r < 0) {
10931 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10932 return r;
10933 }
10934 }
10935
10936 fill_statx(f->inode, mask, stx);
10937 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10938 return r;
10939 }
10940
10941 int Client::statxat(int dirfd, const char *relpath,
10942 struct ceph_statx *stx, const UserPerm& perms,
10943 unsigned int want, unsigned int flags) {
10944 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10945 if (!mref_reader.is_state_satisfied()) {
10946 return -CEPHFS_ENOTCONN;
10947 }
10948
10949 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10950 tout(cct) << dirfd << std::endl;
10951 tout(cct) << relpath << std::endl;
10952
10953 unsigned mask = statx_to_mask(flags, want);
10954
10955 InodeRef dirinode;
10956 std::scoped_lock lock(client_lock);
10957 int r = get_fd_inode(dirfd, &dirinode);
10958 if (r < 0) {
10959 return r;
10960 }
10961
10962 InodeRef in;
10963 filepath path(relpath);
10964 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10965 if (r < 0) {
10966 return r;
10967 }
10968 r = _getattr(in, mask, perms);
10969 if (r < 0) {
10970 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10971 return r;
10972 }
10973
10974 fill_statx(in, mask, stx);
10975 ldout(cct, 3) << __func__ << " dirfd " << dirfd << ", r = " << r << dendl;
10976 return r;
10977 }
10978
10979 // not written yet, but i want to link!
10980
10981 int Client::chdir(const char *relpath, std::string &new_cwd,
10982 const UserPerm& perms)
10983 {
10984 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10985 if (!mref_reader.is_state_satisfied())
10986 return -CEPHFS_ENOTCONN;
10987
10988 tout(cct) << "chdir" << std::endl;
10989 tout(cct) << relpath << std::endl;
10990
10991 filepath path(relpath);
10992 InodeRef in;
10993
10994 std::scoped_lock lock(client_lock);
10995 int r = path_walk(path, &in, perms);
10996 if (r < 0)
10997 return r;
10998
10999 if (!(in.get()->is_dir()))
11000 return -CEPHFS_ENOTDIR;
11001
11002 if (cwd != in)
11003 cwd.swap(in);
11004 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11005
11006 _getcwd(new_cwd, perms);
11007 return 0;
11008 }
11009
11010 void Client::_getcwd(string& dir, const UserPerm& perms)
11011 {
11012 filepath path;
11013 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
11014
11015 Inode *in = cwd.get();
11016 while (in != root.get()) {
11017 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
11018
11019 // The cwd or an ancestor is unlinked
11020 if (in->dentries.empty()) {
11021 return;
11022 }
11023
11024 Dentry *dn = in->get_first_parent();
11025
11026
11027 if (!dn) {
11028 // look it up
11029 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
11030 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
11031 filepath path(in->ino);
11032 req->set_filepath(path);
11033 req->set_inode(in);
11034 int res = make_request(req, perms);
11035 if (res < 0)
11036 break;
11037
11038 // start over
11039 path = filepath();
11040 in = cwd.get();
11041 continue;
11042 }
11043 path.push_front_dentry(dn->name);
11044 in = dn->dir->parent_inode;
11045 }
11046 dir = "/";
11047 dir += path.get_path();
11048 }
11049
11050 void Client::getcwd(string& dir, const UserPerm& perms)
11051 {
11052 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11053 if (!mref_reader.is_state_satisfied())
11054 return;
11055
11056 std::scoped_lock l(client_lock);
11057
11058 _getcwd(dir, perms);
11059 }
11060
11061 int Client::statfs(const char *path, struct statvfs *stbuf,
11062 const UserPerm& perms)
11063 {
11064 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11065 if (!mref_reader.is_state_satisfied())
11066 return -CEPHFS_ENOTCONN;
11067
11068 tout(cct) << __func__ << std::endl;
11069 unsigned long int total_files_on_fs;
11070
11071 ceph_statfs stats;
11072 C_SaferCond cond;
11073
11074 std::unique_lock lock(client_lock);
11075 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
11076 if (data_pools.size() == 1) {
11077 objecter->get_fs_stats(stats, data_pools[0], &cond);
11078 } else {
11079 objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
11080 }
11081
11082 lock.unlock();
11083 int rval = cond.wait();
11084 lock.lock();
11085
11086 ceph_assert(root);
11087 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
11088
11089 if (rval < 0) {
11090 ldout(cct, 1) << "underlying call to statfs returned error: "
11091 << cpp_strerror(rval)
11092 << dendl;
11093 return rval;
11094 }
11095
11096 memset(stbuf, 0, sizeof(*stbuf));
11097
11098 /*
11099 * We're going to set a block size of 4MB so we can represent larger
11100 * FSes without overflowing. Additionally, convert the space
11101 * measurements from KB into units of these 4MB blocks. We use 4MB
11102 * only because it is big enough, and because it actually *is* the
11103 * (ceph) default block size.
11104 */
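/*
 * Worked example (illustrative arithmetic, not upstream code): with
 * CEPH_BLOCK_SHIFT = 22 the block size is 1 << 22 = 4 MiB, and the KB
 * counters below are converted with kb >> (22 - 10). E.g. stats.kb =
 * 1073741824 (1 TiB expressed in KB) becomes 1073741824 >> 12 = 262144
 * blocks, and 262144 * 4 MiB = 1 TiB again.
 */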
11105 const int CEPH_BLOCK_SHIFT = 22;
11106 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
11107 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
11108 stbuf->f_files = total_files_on_fs;
11109 stbuf->f_ffree = -1;
11110 stbuf->f_favail = -1;
11111 stbuf->f_fsid = -1; // ??
11112 stbuf->f_flag = 0; // ??
11113 stbuf->f_namemax = NAME_MAX;
11114
11115 // Usually quota_root will equal root_ancestor, but if the mount root has
11116 // no quota while a parent of it that we can see does have one, we'll
11117 // respect that parent's quota instead.
11118 ceph_assert(root != nullptr);
11119 InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);
11120
11121 // get_quota_root should always give us something if client quotas are
11122 // enabled
11123 ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);
11124
11125 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
11126
11127 // Skip the getattr if any sessions are stale, as we don't want to
11128 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11129 // is unhealthy.
11130 if (!_any_stale_sessions()) {
11131 int r = _getattr(quota_root, 0, perms, true);
11132 if (r != 0) {
11133 // Ignore return value: error getting latest inode metadata is not a good
11134 // reason to break "df".
11135 lderr(cct) << "Error in getattr on quota root 0x"
11136 << std::hex << quota_root->ino << std::dec
11137 << " statfs result may be outdated" << dendl;
11138 }
11139 }
11140
11141 // Special case: if there is a size quota set on the Inode acting
11142 // as the root for this client mount, then report the quota status
11143 // as the filesystem statistics.
11144 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
11145 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
11146 // It is possible for a quota to be exceeded: arithmetic here must
11147 // handle case where used > total.
11148 const fsblkcnt_t free = total > used ? total - used : 0;
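// Worked example (illustrative): quota.max_bytes = 10 GiB gives
// total = (10 << 30) >> 22 = 2560 blocks; if rstat.rbytes = 12 GiB then
// used = 3072 > total, so free clamps to 0 rather than underflowing.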
11149
11150 stbuf->f_blocks = total;
11151 stbuf->f_bfree = free;
11152 stbuf->f_bavail = free;
11153 } else {
11154 // General case: report the cluster statistics returned from RADOS. Because
11155 // multiple pools may be used within one filesystem namespace via
11156 // layouts, this is the most correct thing we can do.
11157 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
11158 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11159 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
11160 }
11161
11162 return rval;
11163 }
11164
11165 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
11166 struct flock *fl, uint64_t owner, bool removing)
11167 {
11168 ldout(cct, 10) << __func__ << " ino " << in->ino
11169 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
11170 << " type " << fl->l_type << " owner " << owner
11171 << " " << fl->l_start << "~" << fl->l_len << dendl;
11172
11173 if (in->flags & I_ERROR_FILELOCK)
11174 return -CEPHFS_EIO;
11175
11176 int lock_cmd;
11177 if (F_RDLCK == fl->l_type)
11178 lock_cmd = CEPH_LOCK_SHARED;
11179 else if (F_WRLCK == fl->l_type)
11180 lock_cmd = CEPH_LOCK_EXCL;
11181 else if (F_UNLCK == fl->l_type)
11182 lock_cmd = CEPH_LOCK_UNLOCK;
11183 else
11184 return -CEPHFS_EIO;
11185
11186 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
11187 sleep = 0;
11188
11189 /*
11190 * Set the most significant bit, so that the MDS knows the 'owner'
11191 * alone is sufficient to identify the owner of the lock. (Old code
11192 * used both 'owner' and 'pid'.)
11193 */
11194 owner |= (1ULL << 63);
11195
11196 MetaRequest *req = new MetaRequest(op);
11197 filepath path;
11198 in->make_nosnap_relative_path(path);
11199 req->set_filepath(path);
11200 req->set_inode(in);
11201
11202 req->head.args.filelock_change.rule = lock_type;
11203 req->head.args.filelock_change.type = lock_cmd;
11204 req->head.args.filelock_change.owner = owner;
11205 req->head.args.filelock_change.pid = fl->l_pid;
11206 req->head.args.filelock_change.start = fl->l_start;
11207 req->head.args.filelock_change.length = fl->l_len;
11208 req->head.args.filelock_change.wait = sleep;
11209
11210 int ret;
11211 bufferlist bl;
11212
11213 if (sleep && switch_interrupt_cb) {
11214 // enable interrupt
11215 switch_interrupt_cb(callback_handle, req->get());
11216 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11217 // disable interrupt
11218 switch_interrupt_cb(callback_handle, NULL);
11219 if (ret == 0 && req->aborted()) {
11220 // effect of this lock request has been revoked by the 'lock intr' request
11221 ret = req->get_abort_code();
11222 }
11223 put_request(req);
11224 } else {
11225 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
11226 }
11227
11228 if (ret == 0) {
11229 if (op == CEPH_MDS_OP_GETFILELOCK) {
11230 ceph_filelock filelock;
11231 auto p = bl.cbegin();
11232 decode(filelock, p);
11233
11234 if (CEPH_LOCK_SHARED == filelock.type)
11235 fl->l_type = F_RDLCK;
11236 else if (CEPH_LOCK_EXCL == filelock.type)
11237 fl->l_type = F_WRLCK;
11238 else
11239 fl->l_type = F_UNLCK;
11240
11241 fl->l_whence = SEEK_SET;
11242 fl->l_start = filelock.start;
11243 fl->l_len = filelock.length;
11244 fl->l_pid = filelock.pid;
11245 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
11246 ceph_lock_state_t *lock_state;
11247 if (lock_type == CEPH_LOCK_FCNTL) {
11248 if (!in->fcntl_locks)
11249 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11250 lock_state = in->fcntl_locks.get();
11251 } else if (lock_type == CEPH_LOCK_FLOCK) {
11252 if (!in->flock_locks)
11253 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11254 lock_state = in->flock_locks.get();
11255 } else {
11256 ceph_abort();
11257 return -CEPHFS_EINVAL;
11258 }
11259 _update_lock_state(fl, owner, lock_state);
11260
11261 if (!removing) {
11262 if (lock_type == CEPH_LOCK_FCNTL) {
11263 if (!fh->fcntl_locks)
11264 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
11265 lock_state = fh->fcntl_locks.get();
11266 } else {
11267 if (!fh->flock_locks)
11268 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
11269 lock_state = fh->flock_locks.get();
11270 }
11271 _update_lock_state(fl, owner, lock_state);
11272 }
11273 } else
11274 ceph_abort();
11275 }
11276 return ret;
11277 }
11278
11279 int Client::_interrupt_filelock(MetaRequest *req)
11280 {
11281 // Set abort code, but do not kick. The abort code prevents the request
11282 // from being re-sent.
11283 req->abort(-CEPHFS_EINTR);
11284 if (req->mds < 0)
11285 return 0; // haven't sent the request
11286
11287 Inode *in = req->inode();
11288
11289 int lock_type;
11290 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
11291 lock_type = CEPH_LOCK_FLOCK_INTR;
11292 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
11293 lock_type = CEPH_LOCK_FCNTL_INTR;
11294 else {
11295 ceph_abort();
11296 return -CEPHFS_EINVAL;
11297 }
11298
11299 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
11300 filepath path;
11301 in->make_nosnap_relative_path(path);
11302 intr_req->set_filepath(path);
11303 intr_req->set_inode(in);
11304 intr_req->head.args.filelock_change = req->head.args.filelock_change;
11305 intr_req->head.args.filelock_change.rule = lock_type;
11306 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
11307
11308 UserPerm perms(req->get_uid(), req->get_gid());
11309 return make_request(intr_req, perms, NULL, NULL, -1);
11310 }
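/*
 * Flow summary (descriptive sketch, not upstream code): when _do_filelock()
 * issues a blocking SETFILELOCK with switch_interrupt_cb registered, an
 * interrupt arrives here as _interrupt_filelock(), which aborts the pending
 * request with -CEPHFS_EINTR and, if it was already sent to an MDS, asks the
 * MDS to cancel it via a CEPH_LOCK_FLOCK_INTR / CEPH_LOCK_FCNTL_INTR unlock
 * of the same range.
 */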
11311
11312 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11313 {
11314 if (!in->fcntl_locks && !in->flock_locks)
11315 return;
11316
11317 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11318 encode(nr_fcntl_locks, bl);
11319 if (nr_fcntl_locks) {
11320 auto &lock_state = in->fcntl_locks;
11321 for(auto p = lock_state->held_locks.begin();
11322 p != lock_state->held_locks.end();
11323 ++p)
11324 encode(p->second, bl);
11325 }
11326
11327 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11328 encode(nr_flock_locks, bl);
11329 if (nr_flock_locks) {
11330 auto &lock_state = in->flock_locks;
11331 for(auto p = lock_state->held_locks.begin();
11332 p != lock_state->held_locks.end();
11333 ++p)
11334 encode(p->second, bl);
11335 }
11336
11337 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11338 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11339 }
11340
11341 void Client::_release_filelocks(Fh *fh)
11342 {
11343 if (!fh->fcntl_locks && !fh->flock_locks)
11344 return;
11345
11346 Inode *in = fh->inode.get();
11347 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
11348
11349 list<ceph_filelock> activated_locks;
11350
11351 list<pair<int, ceph_filelock> > to_release;
11352
11353 if (fh->fcntl_locks) {
11354 auto &lock_state = fh->fcntl_locks;
11355 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11356 auto q = p++;
11357 if (in->flags & I_ERROR_FILELOCK) {
11358 lock_state->remove_lock(q->second, activated_locks);
11359 } else {
11360 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
11361 }
11362 }
11363 lock_state.reset();
11364 }
11365 if (fh->flock_locks) {
11366 auto &lock_state = fh->flock_locks;
11367 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
11368 auto q = p++;
11369 if (in->flags & I_ERROR_FILELOCK) {
11370 lock_state->remove_lock(q->second, activated_locks);
11371 } else {
11372 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
11373 }
11374 }
11375 lock_state.reset();
11376 }
11377
11378 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
11379 in->flags &= ~I_ERROR_FILELOCK;
11380
11381 if (to_release.empty())
11382 return;
11383
11384 struct flock fl;
11385 memset(&fl, 0, sizeof(fl));
11386 fl.l_whence = SEEK_SET;
11387 fl.l_type = F_UNLCK;
11388
11389 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
11390 p != to_release.end();
11391 ++p) {
11392 fl.l_start = p->second.start;
11393 fl.l_len = p->second.length;
11394 fl.l_pid = p->second.pid;
11395 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
11396 p->second.owner, true);
11397 }
11398 }
11399
11400 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11401 ceph_lock_state_t *lock_state)
11402 {
11403 int lock_cmd;
11404 if (F_RDLCK == fl->l_type)
11405 lock_cmd = CEPH_LOCK_SHARED;
11406 else if (F_WRLCK == fl->l_type)
11407 lock_cmd = CEPH_LOCK_EXCL;
11408 else
11409 lock_cmd = CEPH_LOCK_UNLOCK;
11410
11411 ceph_filelock filelock;
11412 filelock.start = fl->l_start;
11413 filelock.length = fl->l_len;
11414 filelock.client = 0;
11415 // see comment in _do_filelock()
11416 filelock.owner = owner | (1ULL << 63);
11417 filelock.pid = fl->l_pid;
11418 filelock.type = lock_cmd;
11419
11420 if (filelock.type == CEPH_LOCK_UNLOCK) {
11421 list<ceph_filelock> activated_locks;
11422 lock_state->remove_lock(filelock, activated_locks);
11423 } else {
11424 bool r = lock_state->add_lock(filelock, false, false, NULL);
11425 ceph_assert(r);
11426 }
11427 }
11428
11429 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11430 {
11431 Inode *in = fh->inode.get();
11432 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11433 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11434 return ret;
11435 }
11436
11437 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11438 {
11439 Inode *in = fh->inode.get();
11440 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11441 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11442 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11443 return ret;
11444 }
11445
11446 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11447 {
11448 Inode *in = fh->inode.get();
11449 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11450
11451 int sleep = !(cmd & LOCK_NB);
11452 cmd &= ~LOCK_NB;
11453
11454 int type;
11455 switch (cmd) {
11456 case LOCK_SH:
11457 type = F_RDLCK;
11458 break;
11459 case LOCK_EX:
11460 type = F_WRLCK;
11461 break;
11462 case LOCK_UN:
11463 type = F_UNLCK;
11464 break;
11465 default:
11466 return -CEPHFS_EINVAL;
11467 }
11468
11469 struct flock fl;
11470 memset(&fl, 0, sizeof(fl));
11471 fl.l_type = type;
11472 fl.l_whence = SEEK_SET;
11473
11474 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11475 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11476 return ret;
11477 }
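/*
 * Illustrative mapping (sketch): a caller issuing a non-blocking exclusive
 * request passes cmd = LOCK_EX | LOCK_NB, which _flock() above turns into
 * sleep = 0 and type = F_WRLCK before handing it to _do_filelock() as a
 * CEPH_LOCK_FLOCK / CEPH_MDS_OP_SETFILELOCK request.
 */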
11478
11479 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11480 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11481 if (!mref_reader.is_state_satisfied()) {
11482 return -CEPHFS_ENOTCONN;
11483 }
11484
11485 std::scoped_lock lock(client_lock);
11486 InodeRef in;
11487 int r = Client::path_walk(path, &in, perms, true);
11488 if (r < 0) {
11489 return r;
11490 }
11491
11492 if (in->snapid == CEPH_NOSNAP) {
11493 return -CEPHFS_EINVAL;
11494 }
11495
11496 snap_info->id = in->snapid;
11497 snap_info->metadata = in->snap_metadata;
11498 return 0;
11499 }
11500
11501 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11502 {
11503 /* Since the only thing this does is wrap a call to statfs, and
11504 statfs takes a lock, it doesn't seem we have a need to split it
11505 out. */
11506 return statfs(0, stbuf, perms);
11507 }
11508
11509 void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
11510 {
11511 if (!args)
11512 return;
11513
11514 ldout(cct, 10) << __func__ << " cb " << args->handle
11515 << " invalidate_ino_cb " << args->ino_cb
11516 << " invalidate_dentry_cb " << args->dentry_cb
11517 << " switch_interrupt_cb " << args->switch_intr_cb
11518 << " remount_cb " << args->remount_cb
11519 << dendl;
11520 callback_handle = args->handle;
11521 if (args->ino_cb) {
11522 ino_invalidate_cb = args->ino_cb;
11523 async_ino_invalidator.start();
11524 }
11525 if (args->dentry_cb) {
11526 dentry_invalidate_cb = args->dentry_cb;
11527 async_dentry_invalidator.start();
11528 }
11529 if (args->switch_intr_cb) {
11530 switch_interrupt_cb = args->switch_intr_cb;
11531 interrupt_finisher.start();
11532 }
11533 if (args->remount_cb) {
11534 remount_cb = args->remount_cb;
11535 remount_finisher.start();
11536 }
11537 if (args->ino_release_cb) {
11538 ino_release_cb = args->ino_release_cb;
11539 async_ino_releasor.start();
11540 }
11541 if (args->umask_cb)
11542 umask_cb = args->umask_cb;
11543 }
11544
11545 // This is deprecated, use ll_register_callbacks2() instead.
11546 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11547 {
11548 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11549
11550 _ll_register_callbacks(args);
11551 }
11552
11553 int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11554 {
11555 if (is_mounting() || is_mounted() || is_unmounting())
11556 return -CEPHFS_EBUSY;
11557
11558 _ll_register_callbacks(args);
11559 return 0;
11560 }
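/*
 * Illustrative usage sketch (the callback implementations named here are
 * hypothetical): callbacks must be registered before mounting, e.g.
 *
 *   struct ceph_client_callback_args args = {};
 *   args.handle = my_handle;           // opaque, handed back to every callback
 *   args.ino_cb = my_ino_invalidate;   // hypothetical implementations
 *   args.dentry_cb = my_dentry_invalidate;
 *   if (client->ll_register_callbacks2(&args) == -CEPHFS_EBUSY) {
 *     // too late: the client is already mounting, mounted, or unmounting
 *   }
 */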
11561
11562 std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
11563 {
11564 std::pair <int, bool> r(0, false);
11565
11566 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11567 if (!iref_reader.is_state_satisfied())
11568 return std::make_pair(-CEPHFS_ENOTCONN, false);
11569
11570 can_invalidate_dentries = can_invalidate;
11571
11572 if (can_invalidate_dentries) {
11573 ceph_assert(dentry_invalidate_cb);
11574 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11575 } else {
11576 ceph_assert(remount_cb);
11577 ldout(cct, 1) << "using remount_cb" << dendl;
11578 r = _do_remount(false);
11579 }
11580
11581 return r;
11582 }
11583
11584 int Client::_sync_fs()
11585 {
11586 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11587
11588 ldout(cct, 10) << __func__ << dendl;
11589
11590 // flush file data
11591 std::unique_ptr<C_SaferCond> cond = nullptr;
11592 if (cct->_conf->client_oc) {
11593 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11594 objectcacher->flush_all(cond.get());
11595 }
11596
11597 // flush caps
11598 flush_caps_sync();
11599 ceph_tid_t flush_tid = last_flush_tid;
11600
11601 // wait for unsafe mds requests
11602 wait_unsafe_requests();
11603
11604 wait_sync_caps(flush_tid);
11605
11606 if (nullptr != cond) {
11607 client_lock.unlock();
11608 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11609 cond->wait();
11610 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11611 client_lock.lock();
11612 }
11613
11614 return 0;
11615 }
11616
11617 int Client::sync_fs()
11618 {
11619 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11620 if (!mref_reader.is_state_satisfied())
11621 return -CEPHFS_ENOTCONN;
11622
11623 std::scoped_lock l(client_lock);
11624
11625 return _sync_fs();
11626 }
11627
11628 int64_t Client::drop_caches()
11629 {
11630 std::scoped_lock l(client_lock);
11631 return objectcacher->release_all();
11632 }
11633
11634 int Client::_lazyio(Fh *fh, int enable)
11635 {
11636 Inode *in = fh->inode.get();
11637 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11638
11639 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11640 return 0;
11641
11642 int orig_mode = fh->mode;
11643 if (enable) {
11644 fh->mode |= CEPH_FILE_MODE_LAZY;
11645 in->get_open_ref(fh->mode);
11646 in->put_open_ref(orig_mode);
11647 check_caps(in, CHECK_CAPS_NODELAY);
11648 } else {
11649 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11650 in->get_open_ref(fh->mode);
11651 in->put_open_ref(orig_mode);
11652 check_caps(in, 0);
11653 }
11654
11655 return 0;
11656 }
11657
11658 int Client::lazyio(int fd, int enable)
11659 {
11660 std::scoped_lock l(client_lock);
11661 Fh *f = get_filehandle(fd);
11662 if (!f)
11663 return -CEPHFS_EBADF;
11664
11665 return _lazyio(f, enable);
11666 }
11667
11668 int Client::ll_lazyio(Fh *fh, int enable)
11669 {
11670 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11671 tout(cct) << __func__ << std::endl;
11672
11673 std::scoped_lock lock(client_lock);
11674 return _lazyio(fh, enable);
11675 }
11676
11677 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11678 {
11679 std::scoped_lock l(client_lock);
11680 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11681 << ", " << offset << ", " << count << ")" << dendl;
11682
11683 Fh *f = get_filehandle(fd);
11684 if (!f)
11685 return -CEPHFS_EBADF;
11686
11687 // for now
11688 _fsync(f, true);
11689
11690 return 0;
11691 }
11692
11693 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11694 {
11695 std::scoped_lock l(client_lock);
11696 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11697 << ", " << offset << ", " << count << ")" << dendl;
11698
11699 Fh *f = get_filehandle(fd);
11700 if (!f)
11701 return -CEPHFS_EBADF;
11702 Inode *in = f->inode.get();
11703
11704 _fsync(f, true);
11705 if (_release(in)) {
11706 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11707 if (r < 0)
11708 return r;
11709 }
11710 return 0;
11711 }
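/*
 * Note on the lazyio helpers above (sketch of intended use): a writer calls
 * lazyio_propagate(fd, off, len) to force its dirty data out (implemented
 * here, for now, as a full _fsync), and a reader calls lazyio_synchronize()
 * to drop cached data via _release() and refresh the file size with
 * _getattr() so it can observe the writer's changes.
 */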
11712
11713
11714 // =============================
11715 // snaps
11716
11717 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11718 mode_t mode, const std::map<std::string, std::string> &metadata)
11719 {
11720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11721 if (!mref_reader.is_state_satisfied())
11722 return -CEPHFS_ENOTCONN;
11723
11724 std::scoped_lock l(client_lock);
11725
11726 filepath path(relpath);
11727 InodeRef in;
11728 int r = path_walk(path, &in, perm);
11729 if (r < 0)
11730 return r;
11731 if (cct->_conf->client_permissions) {
11732 r = may_create(in.get(), perm);
11733 if (r < 0)
11734 return r;
11735 }
11736 Inode *snapdir = open_snapdir(in.get());
11737 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11738 }
11739
11740 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11741 {
11742 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11743 if (!mref_reader.is_state_satisfied())
11744 return -CEPHFS_ENOTCONN;
11745
11746 std::scoped_lock l(client_lock);
11747
11748 filepath path(relpath);
11749 InodeRef in;
11750 int r = path_walk(path, &in, perms);
11751 if (r < 0)
11752 return r;
11753 Inode *snapdir = open_snapdir(in.get());
11754 if (cct->_conf->client_permissions) {
11755 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11756 if (r < 0)
11757 return r;
11758 }
11759 return _rmdir(snapdir, name, perms);
11760 }
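/*
 * Illustrative usage (sketch; assumes the default ".snap" snapdir name):
 *   client->mksnap("/some/dir", "mysnap", perms, 0755, {});
 * behaves like "mkdir /some/dir/.snap/mysnap" -- both calls above resolve
 * the target directory, open its snapdir inode, and then reuse the ordinary
 * _mkdir()/_rmdir() paths on it.
 */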
11761
11762 // =============================
11763 // expose caps
11764
11765 int Client::get_caps_issued(int fd)
11766 {
11767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11768 if (!mref_reader.is_state_satisfied())
11769 return -CEPHFS_ENOTCONN;
11770
11771 std::scoped_lock lock(client_lock);
11772
11773 Fh *f = get_filehandle(fd);
11774 if (!f)
11775 return -CEPHFS_EBADF;
11776
11777 return f->inode->caps_issued();
11778 }
11779
11780 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11781 {
11782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11783 if (!mref_reader.is_state_satisfied())
11784 return -CEPHFS_ENOTCONN;
11785
11786 std::scoped_lock lock(client_lock);
11787
11788 filepath p(path);
11789 InodeRef in;
11790 int r = path_walk(p, &in, perms, true);
11791 if (r < 0)
11792 return r;
11793 return in->caps_issued();
11794 }
11795
11796 // =========================================
11797 // low level
11798
11799 Inode *Client::open_snapdir(Inode *diri)
11800 {
11801 Inode *in;
11802 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11803 if (!inode_map.count(vino)) {
11804 in = new Inode(this, vino, &diri->layout);
11805
11806 in->ino = diri->ino;
11807 in->snapid = CEPH_SNAPDIR;
11808 in->mode = diri->mode;
11809 in->uid = diri->uid;
11810 in->gid = diri->gid;
11811 in->nlink = 1;
11812 in->mtime = diri->mtime;
11813 in->ctime = diri->ctime;
11814 in->btime = diri->btime;
11815 in->atime = diri->atime;
11816 in->size = diri->size;
11817 in->change_attr = diri->change_attr;
11818
11819 in->dirfragtree.clear();
11820 in->snapdir_parent = diri;
11821 diri->flags |= I_SNAPDIR_OPEN;
11822 inode_map[vino] = in;
11823 if (use_faked_inos())
11824 _assign_faked_ino(in);
11825 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11826 } else {
11827 in = inode_map[vino];
11828 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11829 }
11830 return in;
11831 }
11832
11833 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11834 Inode **out, const UserPerm& perms)
11835 {
11836 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11837 if (!mref_reader.is_state_satisfied())
11838 return -CEPHFS_ENOTCONN;
11839
11840 vinodeno_t vparent = _get_vino(parent);
11841 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11842 tout(cct) << __func__ << std::endl;
11843 tout(cct) << name << std::endl;
11844
11845 std::scoped_lock lock(client_lock);
11846
11847 int r = 0;
11848 if (!fuse_default_permissions) {
11849 if (strcmp(name, ".") && strcmp(name, "..")) {
11850 r = may_lookup(parent, perms);
11851 if (r < 0)
11852 return r;
11853 }
11854 }
11855
11856 string dname(name);
11857 InodeRef in;
11858
11859 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11860 if (r < 0) {
11861 attr->st_ino = 0;
11862 goto out;
11863 }
11864
11865 ceph_assert(in);
11866 fill_stat(in, attr);
11867 _ll_get(in.get());
11868
11869 out:
11870 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11871 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11872 tout(cct) << attr->st_ino << std::endl;
11873 *out = in.get();
11874 return r;
11875 }
11876
11877 int Client::ll_lookup_vino(
11878 vinodeno_t vino,
11879 const UserPerm& perms,
11880 Inode **inode)
11881 {
11882 ceph_assert(inode != NULL);
11883 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11884 if (!mref_reader.is_state_satisfied())
11885 return -CEPHFS_ENOTCONN;
11886
11887 if (is_reserved_vino(vino))
11888 return -CEPHFS_ESTALE;
11889
11890 std::scoped_lock lock(client_lock);
11891 ldout(cct, 3) << __func__ << " " << vino << dendl;
11892
11893 // Check the cache first
11894 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11895 if (p != inode_map.end()) {
11896 *inode = p->second;
11897 _ll_get(*inode);
11898 return 0;
11899 }
11900
11901 uint64_t snapid = vino.snapid;
11902
11903 // for snapdir, find the non-snapped dir inode
11904 if (snapid == CEPH_SNAPDIR)
11905 vino.snapid = CEPH_NOSNAP;
11906
11907 int r = _lookup_vino(vino, perms, inode);
11908 if (r)
11909 return r;
11910 ceph_assert(*inode != NULL);
11911
11912 if (snapid == CEPH_SNAPDIR) {
11913 Inode *tmp = *inode;
11914
11915 // open the snapdir and put the inode ref
11916 *inode = open_snapdir(tmp);
11917 _ll_forget(tmp, 1);
11918 _ll_get(*inode);
11919 }
11920 return 0;
11921 }
11922
11923 int Client::ll_lookup_inode(
11924 struct inodeno_t ino,
11925 const UserPerm& perms,
11926 Inode **inode)
11927 {
11928 vinodeno_t vino(ino, CEPH_NOSNAP);
11929 return ll_lookup_vino(vino, perms, inode);
11930 }
11931
11932 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11933 struct ceph_statx *stx, unsigned want, unsigned flags,
11934 const UserPerm& perms)
11935 {
11936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11937 if (!mref_reader.is_state_satisfied())
11938 return -CEPHFS_ENOTCONN;
11939
11940 vinodeno_t vparent = _get_vino(parent);
11941 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11942 tout(cct) << "ll_lookupx" << std::endl;
11943 tout(cct) << name << std::endl;
11944
11945 std::scoped_lock lock(client_lock);
11946
11947 int r = 0;
11948 if (!fuse_default_permissions) {
11949 r = may_lookup(parent, perms);
11950 if (r < 0)
11951 return r;
11952 }
11953
11954 string dname(name);
11955 InodeRef in;
11956
11957 unsigned mask = statx_to_mask(flags, want);
11958 r = _lookup(parent, dname, mask, &in, perms);
11959 if (r < 0) {
11960 stx->stx_ino = 0;
11961 stx->stx_mask = 0;
11962 } else {
11963 ceph_assert(in);
11964 fill_statx(in, mask, stx);
11965 _ll_get(in.get());
11966 }
11967
11968 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11969 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11970 tout(cct) << stx->stx_ino << std::endl;
11971 *out = in.get();
11972 return r;
11973 }
11974
11975 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11976 unsigned int want, unsigned int flags, const UserPerm& perms)
11977 {
11978 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11979 if (!mref_reader.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN;
11981
11982 filepath fp(name, 0);
11983 InodeRef in;
11984 int rc;
11985 unsigned mask = statx_to_mask(flags, want);
11986
11987 ldout(cct, 3) << __func__ << " " << name << dendl;
11988 tout(cct) << __func__ << std::endl;
11989 tout(cct) << name << std::endl;
11990
11991 std::scoped_lock lock(client_lock);
11992 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11993 if (rc < 0) {
11994 /* zero out mask, just in case... */
11995 stx->stx_mask = 0;
11996 stx->stx_ino = 0;
11997 *out = NULL;
11998 return rc;
11999 } else {
12000 ceph_assert(in);
12001 fill_statx(in, mask, stx);
12002 _ll_get(in.get());
12003 *out = in.get();
12004 return 0;
12005 }
12006 }
12007
12008 void Client::_ll_get(Inode *in)
12009 {
12010 if (in->ll_ref == 0) {
12011 in->iget();
12012 if (in->is_dir() && !in->dentries.empty()) {
12013 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
12014 in->get_first_parent()->get(); // pin dentry
12015 }
12016 if (in->snapid != CEPH_NOSNAP)
12017 ll_snap_ref[in->snapid]++;
12018 }
12019 in->ll_get();
12020 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
12021 }
12022
12023 int Client::_ll_put(Inode *in, uint64_t num)
12024 {
12025 in->ll_put(num);
12026 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
12027 if (in->ll_ref == 0) {
12028 if (in->is_dir() && !in->dentries.empty()) {
12029 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
12030 in->get_first_parent()->put(); // unpin dentry
12031 }
12032 if (in->snapid != CEPH_NOSNAP) {
12033 auto p = ll_snap_ref.find(in->snapid);
12034 ceph_assert(p != ll_snap_ref.end());
12035 ceph_assert(p->second > 0);
12036 if (--p->second == 0)
12037 ll_snap_ref.erase(p);
12038 }
12039 put_inode(in);
12040 return 0;
12041 } else {
12042 return in->ll_ref;
12043 }
12044 }
12045
12046 void Client::_ll_drop_pins()
12047 {
12048 ldout(cct, 10) << __func__ << dendl;
12049 std::set<InodeRef> to_be_put; // this set will be destructed item by item on function exit
12050 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
12051 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
12052 it != inode_map.end();
12053 it = next) {
12054 Inode *in = it->second;
12055 next = it;
12056 ++next;
12057 if (in->ll_ref){
12058 to_be_put.insert(in);
12059 _ll_put(in, in->ll_ref);
12060 }
12061 }
12062 }
12063
12064 bool Client::_ll_forget(Inode *in, uint64_t count)
12065 {
12066 inodeno_t ino = in->ino;
12067
12068 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
12069 tout(cct) << __func__ << std::endl;
12070 tout(cct) << ino.val << std::endl;
12071 tout(cct) << count << std::endl;
12072
12073 // Ignore forget if we're no longer mounted
12074 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12075 if (!mref_reader.is_state_satisfied())
12076 return true;
12077
12078 if (ino == 1) return true; // ignore forget on root.
12079
12080 bool last = false;
12081 if (in->ll_ref < count) {
12082 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
12083 << ", which only has ll_ref=" << in->ll_ref << dendl;
12084 _ll_put(in, in->ll_ref);
12085 last = true;
12086 } else {
12087 if (_ll_put(in, count) == 0)
12088 last = true;
12089 }
12090
12091 return last;
12092 }
12093
12094 bool Client::ll_forget(Inode *in, uint64_t count)
12095 {
12096 std::scoped_lock lock(client_lock);
12097 return _ll_forget(in, count);
12098 }
12099
12100 bool Client::ll_put(Inode *in)
12101 {
12102 /* ll_forget already takes the lock */
12103 return ll_forget(in, 1);
12104 }
12105
12106 int Client::ll_get_snap_ref(snapid_t snap)
12107 {
12108 std::scoped_lock lock(client_lock);
12109 auto p = ll_snap_ref.find(snap);
12110 if (p != ll_snap_ref.end())
12111 return p->second;
12112 return 0;
12113 }
12114
12115 snapid_t Client::ll_get_snapid(Inode *in)
12116 {
12117 std::scoped_lock lock(client_lock);
12118 return in->snapid;
12119 }
12120
12121 Inode *Client::ll_get_inode(ino_t ino)
12122 {
12123 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12124 if (!mref_reader.is_state_satisfied())
12125 return NULL;
12126
12127 std::scoped_lock lock(client_lock);
12128
12129 vinodeno_t vino = _map_faked_ino(ino);
12130 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12131 if (p == inode_map.end())
12132 return NULL;
12133 Inode *in = p->second;
12134 _ll_get(in);
12135 return in;
12136 }
12137
12138 Inode *Client::ll_get_inode(vinodeno_t vino)
12139 {
12140 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12141 if (!mref_reader.is_state_satisfied())
12142 return NULL;
12143
12144 if (is_reserved_vino(vino))
12145 return NULL;
12146
12147 std::scoped_lock lock(client_lock);
12148
12149 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12150 if (p == inode_map.end())
12151 return NULL;
12152 Inode *in = p->second;
12153 _ll_get(in);
12154 return in;
12155 }
12156
12157 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12158 {
12159 vinodeno_t vino = _get_vino(in);
12160
12161 ldout(cct, 8) << __func__ << " " << vino << dendl;
12162 tout(cct) << __func__ << std::endl;
12163 tout(cct) << vino.ino.val << std::endl;
12164
12165 if (vino.snapid < CEPH_NOSNAP)
12166 return 0;
12167 else
12168 return _getattr(in, caps, perms);
12169 }
12170
12171 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12172 {
12173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12174 if (!mref_reader.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN;
12176
12177 std::scoped_lock lock(client_lock);
12178
12179 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12180
12181 if (res == 0)
12182 fill_stat(in, attr);
12183 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12184 return res;
12185 }
12186
12187 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12188 unsigned int flags, const UserPerm& perms)
12189 {
12190 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12191 if (!mref_reader.is_state_satisfied())
12192 return -CEPHFS_ENOTCONN;
12193
12194 std::scoped_lock lock(client_lock);
12195
12196 int res = 0;
12197 unsigned mask = statx_to_mask(flags, want);
12198
12199 if (mask && !in->caps_issued_mask(mask, true))
12200 res = _ll_getattr(in, mask, perms);
12201
12202 if (res == 0)
12203 fill_statx(in, mask, stx);
12204 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12205 return res;
12206 }
12207
12208 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12209 const UserPerm& perms, InodeRef *inp)
12210 {
12211 vinodeno_t vino = _get_vino(in);
12212
12213 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
12214 << dendl;
12215 tout(cct) << __func__ << std::endl;
12216 tout(cct) << vino.ino.val << std::endl;
12217 tout(cct) << stx->stx_mode << std::endl;
12218 tout(cct) << stx->stx_uid << std::endl;
12219 tout(cct) << stx->stx_gid << std::endl;
12220 tout(cct) << stx->stx_size << std::endl;
12221 tout(cct) << stx->stx_mtime << std::endl;
12222 tout(cct) << stx->stx_atime << std::endl;
12223 tout(cct) << stx->stx_btime << std::endl;
12224 tout(cct) << mask << std::endl;
12225
12226 if (!fuse_default_permissions) {
12227 int res = may_setattr(in, stx, mask, perms);
12228 if (res < 0)
12229 return res;
12230 }
12231
12232 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
12233
12234 return __setattrx(in, stx, mask, perms, inp);
12235 }
12236
12237 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12238 const UserPerm& perms)
12239 {
12240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12241 if (!mref_reader.is_state_satisfied())
12242 return -CEPHFS_ENOTCONN;
12243
12244 std::scoped_lock lock(client_lock);
12245
12246 InodeRef target(in);
12247 int res = _ll_setattrx(in, stx, mask, perms, &target);
12248 if (res == 0) {
12249 ceph_assert(in == target.get());
12250 fill_statx(in, in->caps_issued(), stx);
12251 }
12252
12253 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12254 return res;
12255 }
12256
12257 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12258 const UserPerm& perms)
12259 {
12260 struct ceph_statx stx;
12261 stat_to_statx(attr, &stx);
12262
12263 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12264 if (!mref_reader.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN;
12266
12267 std::scoped_lock lock(client_lock);
12268
12269 InodeRef target(in);
12270 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12271 if (res == 0) {
12272 ceph_assert(in == target.get());
12273 fill_stat(in, attr);
12274 }
12275
12276 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12277 return res;
12278 }
12279
12280
12281 // ----------
12282 // xattrs
12283
12284 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12285 const UserPerm& perms)
12286 {
12287 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12288 if (!mref_reader.is_state_satisfied())
12289 return -CEPHFS_ENOTCONN;
12290
12291 std::scoped_lock lock(client_lock);
12292
12293 InodeRef in;
12294 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12295 if (r < 0)
12296 return r;
12297 return _getxattr(in, name, value, size, perms);
12298 }
12299
12300 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12301 const UserPerm& perms)
12302 {
12303 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12304 if (!mref_reader.is_state_satisfied())
12305 return -CEPHFS_ENOTCONN;
12306
12307 std::scoped_lock lock(client_lock);
12308
12309 InodeRef in;
12310 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12311 if (r < 0)
12312 return r;
12313 return _getxattr(in, name, value, size, perms);
12314 }
12315
12316 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12317 const UserPerm& perms)
12318 {
12319 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12320 if (!mref_reader.is_state_satisfied())
12321 return -CEPHFS_ENOTCONN;
12322
12323 std::scoped_lock lock(client_lock);
12324
12325 Fh *f = get_filehandle(fd);
12326 if (!f)
12327 return -CEPHFS_EBADF;
12328 return _getxattr(f->inode, name, value, size, perms);
12329 }
12330
12331 int Client::listxattr(const char *path, char *list, size_t size,
12332 const UserPerm& perms)
12333 {
12334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12335 if (!mref_reader.is_state_satisfied())
12336 return -CEPHFS_ENOTCONN;
12337
12338 std::scoped_lock lock(client_lock);
12339
12340 InodeRef in;
12341 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12342 if (r < 0)
12343 return r;
12344 return Client::_listxattr(in.get(), list, size, perms);
12345 }
12346
12347 int Client::llistxattr(const char *path, char *list, size_t size,
12348 const UserPerm& perms)
12349 {
12350 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12351 if (!mref_reader.is_state_satisfied())
12352 return -CEPHFS_ENOTCONN;
12353
12354 std::scoped_lock lock(client_lock);
12355
12356 InodeRef in;
12357 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12358 if (r < 0)
12359 return r;
12360 return Client::_listxattr(in.get(), list, size, perms);
12361 }
12362
12363 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12364 {
12365 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12366 if (!mref_reader.is_state_satisfied())
12367 return -CEPHFS_ENOTCONN;
12368
12369 std::scoped_lock lock(client_lock);
12370
12371 Fh *f = get_filehandle(fd);
12372 if (!f)
12373 return -CEPHFS_EBADF;
12374 return Client::_listxattr(f->inode.get(), list, size, perms);
12375 }
12376
12377 int Client::removexattr(const char *path, const char *name,
12378 const UserPerm& perms)
12379 {
12380 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12381 if (!mref_reader.is_state_satisfied())
12382 return -CEPHFS_ENOTCONN;
12383
12384 std::scoped_lock lock(client_lock);
12385
12386 InodeRef in;
12387 int r = Client::path_walk(path, &in, perms, true);
12388 if (r < 0)
12389 return r;
12390 return _removexattr(in, name, perms);
12391 }
12392
12393 int Client::lremovexattr(const char *path, const char *name,
12394 const UserPerm& perms)
12395 {
12396 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12397 if (!mref_reader.is_state_satisfied())
12398 return -CEPHFS_ENOTCONN;
12399
12400 std::scoped_lock lock(client_lock);
12401
12402 InodeRef in;
12403 int r = Client::path_walk(path, &in, perms, false);
12404 if (r < 0)
12405 return r;
12406 return _removexattr(in, name, perms);
12407 }
12408
12409 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12410 {
12411 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12412 if (!mref_reader.is_state_satisfied())
12413 return -CEPHFS_ENOTCONN;
12414
12415 std::scoped_lock lock(client_lock);
12416
12417 Fh *f = get_filehandle(fd);
12418 if (!f)
12419 return -CEPHFS_EBADF;
12420 return _removexattr(f->inode, name, perms);
12421 }
12422
12423 int Client::setxattr(const char *path, const char *name, const void *value,
12424 size_t size, int flags, const UserPerm& perms)
12425 {
12426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12427 if (!mref_reader.is_state_satisfied())
12428 return -CEPHFS_ENOTCONN;
12429
12430 _setxattr_maybe_wait_for_osdmap(name, value, size);
12431
12432 std::scoped_lock lock(client_lock);
12433
12434 InodeRef in;
12435 int r = Client::path_walk(path, &in, perms, true);
12436 if (r < 0)
12437 return r;
12438 return _setxattr(in, name, value, size, flags, perms);
12439 }
12440
12441 int Client::lsetxattr(const char *path, const char *name, const void *value,
12442 size_t size, int flags, const UserPerm& perms)
12443 {
12444 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12445 if (!mref_reader.is_state_satisfied())
12446 return -CEPHFS_ENOTCONN;
12447
12448 _setxattr_maybe_wait_for_osdmap(name, value, size);
12449
12450 std::scoped_lock lock(client_lock);
12451
12452 InodeRef in;
12453 int r = Client::path_walk(path, &in, perms, false);
12454 if (r < 0)
12455 return r;
12456 return _setxattr(in, name, value, size, flags, perms);
12457 }
12458
12459 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12460 int flags, const UserPerm& perms)
12461 {
12462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12463 if (!mref_reader.is_state_satisfied())
12464 return -CEPHFS_ENOTCONN;
12465
12466 _setxattr_maybe_wait_for_osdmap(name, value, size);
12467
12468 std::scoped_lock lock(client_lock);
12469
12470 Fh *f = get_filehandle(fd);
12471 if (!f)
12472 return -CEPHFS_EBADF;
12473 return _setxattr(f->inode, name, value, size, flags, perms);
12474 }
12475
12476 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12477 const UserPerm& perms)
12478 {
12479 int r;
12480 const VXattr *vxattr = nullptr;
12481
12482 vxattr = _match_vxattr(in, name);
12483 if (vxattr) {
12484 r = -CEPHFS_ENODATA;
12485
12486 // Do a forced getattr to get the latest quota before returning
12487 // a value to userspace.
12488 int flags = 0;
12489 if (vxattr->flags & VXATTR_RSTAT) {
12490 flags |= CEPH_STAT_RSTAT;
12491 }
12492 if (vxattr->flags & VXATTR_DIRSTAT) {
12493 flags |= CEPH_CAP_FILE_SHARED;
12494 }
12495 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12496 if (r != 0) {
12497 // Error from getattr!
12498 return r;
12499 }
12500
12501 // call pointer-to-member function
12502 char buf[256];
12503 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12504 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12505 } else {
12506 r = -CEPHFS_ENODATA;
12507 }
12508
12509 if (size != 0) {
12510 if (r > (int)size) {
12511 r = -CEPHFS_ERANGE;
12512 } else if (r > 0) {
12513 memcpy(value, buf, r);
12514 }
12515 }
12516 goto out;
12517 }
12518
12519 if (!strncmp(name, "ceph.", 5)) {
12520 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12521 goto out;
12522 }
12523
12524 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12525 r = -CEPHFS_EOPNOTSUPP;
12526 goto out;
12527 }
12528
12529 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12530 if (r == 0) {
12531 string n(name);
12532 r = -CEPHFS_ENODATA;
12533 if (in->xattrs.count(n)) {
12534 r = in->xattrs[n].length();
12535 if (r > 0 && size != 0) {
12536 if (size >= (unsigned)r)
12537 memcpy(value, in->xattrs[n].c_str(), r);
12538 else
12539 r = -CEPHFS_ERANGE;
12540 }
12541 }
12542 }
12543 out:
12544 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12545 return r;
12546 }
12547
12548 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12549 const UserPerm& perms)
12550 {
12551 if (cct->_conf->client_permissions) {
12552 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12553 if (r < 0)
12554 return r;
12555 }
12556 return _getxattr(in.get(), name, value, size, perms);
12557 }
12558
12559 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12560 size_t size, const UserPerm& perms)
12561 {
12562 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12563 if (!mref_reader.is_state_satisfied())
12564 return -CEPHFS_ENOTCONN;
12565
12566 vinodeno_t vino = _get_vino(in);
12567
12568 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12569 tout(cct) << __func__ << std::endl;
12570 tout(cct) << vino.ino.val << std::endl;
12571 tout(cct) << name << std::endl;
12572
12573 std::scoped_lock lock(client_lock);
12574 if (!fuse_default_permissions) {
12575 int r = xattr_permission(in, name, MAY_READ, perms);
12576 if (r < 0)
12577 return r;
12578 }
12579
12580 return _getxattr(in, name, value, size, perms);
12581 }
12582
12583 int Client::_listxattr(Inode *in, char *name, size_t size,
12584 const UserPerm& perms)
12585 {
12586 bool len_only = (size == 0);
12587 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12588 if (r != 0) {
12589 goto out;
12590 }
12591
12592 r = 0;
12593 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12594 if (xattr_name.rfind("ceph.", 0) == 0) {
12595 continue;
12596 }
12597
12598 size_t this_len = xattr_name.length() + 1;
12599 r += this_len;
12600 if (len_only)
12601 continue;
12602
12603 if (this_len > size) {
12604 r = -CEPHFS_ERANGE;
12605 goto out;
12606 }
12607
12608 memcpy(name, xattr_name.c_str(), this_len);
12609 name += this_len;
12610 size -= this_len;
12611 }
12612 out:
12613 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12614 return r;
12615 }
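/*
 * Illustrative two-pass usage (sketch; 'path' is a hypothetical caller
 * variable): size == 0 asks only for the required length, names are returned
 * NUL-terminated back to back, and a too-small buffer yields -CEPHFS_ERANGE:
 *
 *   int len = client->listxattr(path, nullptr, 0, perms);
 *   std::vector<char> buf(len > 0 ? len : 0);
 *   if (len > 0)
 *     len = client->listxattr(path, buf.data(), buf.size(), perms);
 */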
12616
12617 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12618 const UserPerm& perms)
12619 {
12620 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12621 if (!mref_reader.is_state_satisfied())
12622 return -CEPHFS_ENOTCONN;
12623
12624 vinodeno_t vino = _get_vino(in);
12625
12626 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12627 tout(cct) << __func__ << std::endl;
12628 tout(cct) << vino.ino.val << std::endl;
12629 tout(cct) << size << std::endl;
12630
12631 std::scoped_lock lock(client_lock);
12632 return _listxattr(in, names, size, perms);
12633 }
12634
12635 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12636 size_t size, int flags, const UserPerm& perms)
12637 {
12638
12639 int xattr_flags = 0;
12640 if (!value)
12641 xattr_flags |= CEPH_XATTR_REMOVE;
12642 if (flags & XATTR_CREATE)
12643 xattr_flags |= CEPH_XATTR_CREATE;
12644 if (flags & XATTR_REPLACE)
12645 xattr_flags |= CEPH_XATTR_REPLACE;
12646
12647 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12648 filepath path;
12649 in->make_nosnap_relative_path(path);
12650 req->set_filepath(path);
12651 req->set_string2(name);
12652 req->set_inode(in);
12653 req->head.args.setxattr.flags = xattr_flags;
12654
12655 bufferlist bl;
12656 ceph_assert(value || size == 0);
12657 bl.append((const char*)value, size);
12658 req->set_data(bl);
12659
12660 int res = make_request(req, perms);
12661
12662 trim_cache();
12663 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12664 res << dendl;
12665 return res;
12666 }
12667
12668 int Client::_setxattr(Inode *in, const char *name, const void *value,
12669 size_t size, int flags, const UserPerm& perms)
12670 {
12671 if (in->snapid != CEPH_NOSNAP) {
12672 return -CEPHFS_EROFS;
12673 }
12674
12675 if (size == 0) {
12676 value = "";
12677 } else if (value == NULL) {
12678 return -CEPHFS_EINVAL;
12679 }
12680
12681 bool posix_acl_xattr = false;
12682 if (acl_type == POSIX_ACL)
12683 posix_acl_xattr = !strncmp(name, "system.", 7);
12684
12685 if (strncmp(name, "user.", 5) &&
12686 strncmp(name, "security.", 9) &&
12687 strncmp(name, "trusted.", 8) &&
12688 strncmp(name, "ceph.", 5) &&
12689 !posix_acl_xattr)
12690 return -CEPHFS_EOPNOTSUPP;
12691
12692 bool check_realm = false;
12693
12694 if (posix_acl_xattr) {
12695 if (!strcmp(name, ACL_EA_ACCESS)) {
12696 mode_t new_mode = in->mode;
12697 if (value) {
12698 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12699 if (ret < 0)
12700 return ret;
12701 if (ret == 0) {
12702 value = NULL;
12703 size = 0;
12704 }
12705 if (new_mode != in->mode) {
12706 struct ceph_statx stx;
12707 stx.stx_mode = new_mode;
12708 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12709 if (ret < 0)
12710 return ret;
12711 }
12712 }
12713 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12714 if (value) {
12715 if (!S_ISDIR(in->mode))
12716 return -CEPHFS_EACCES;
12717 int ret = posix_acl_check(value, size);
12718 if (ret < 0)
12719 return -CEPHFS_EINVAL;
12720 if (ret == 0) {
12721 value = NULL;
12722 size = 0;
12723 }
12724 }
12725 } else {
12726 return -CEPHFS_EOPNOTSUPP;
12727 }
12728 } else {
12729 const VXattr *vxattr = _match_vxattr(in, name);
12730 if (vxattr) {
12731 if (vxattr->readonly)
12732 return -CEPHFS_EOPNOTSUPP;
12733 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12734 check_realm = true;
12735 }
12736 }
12737
12738 int ret = _do_setxattr(in, name, value, size, flags, perms);
12739 if (ret >= 0 && check_realm) {
12740 // check if snaprealm was created for quota inode
12741 if (in->quota.is_enable() &&
12742 !(in->snaprealm && in->snaprealm->ino == in->ino))
12743 ret = -CEPHFS_EOPNOTSUPP;
12744 }
12745
12746 return ret;
12747 }
12748
12749 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12750 size_t size, int flags, const UserPerm& perms)
12751 {
12752 if (cct->_conf->client_permissions) {
12753 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12754 if (r < 0)
12755 return r;
12756 }
12757 return _setxattr(in.get(), name, value, size, flags, perms);
12758 }
12759
12760 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12761 {
12762 string tmp;
12763 if (name == "layout") {
12764 string::iterator begin = value.begin();
12765 string::iterator end = value.end();
12766 keys_and_values<string::iterator> p; // create instance of parser
12767 std::map<string, string> m; // map to receive results
12768 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12769 return -CEPHFS_EINVAL;
12770 }
12771 if (begin != end)
12772 return -CEPHFS_EINVAL;
12773 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12774 if (q->first == "pool") {
12775 tmp = q->second;
12776 break;
12777 }
12778 }
12779 } else if (name == "layout.pool") {
12780 tmp = value;
12781 }
12782
12783 if (tmp.length()) {
12784 int64_t pool;
12785 try {
12786 pool = boost::lexical_cast<unsigned>(tmp);
12787 if (!osdmap->have_pg_pool(pool))
12788 return -CEPHFS_ENOENT;
12789 } catch (boost::bad_lexical_cast const&) {
12790 pool = osdmap->lookup_pg_pool_name(tmp);
12791 if (pool < 0) {
12792 return -CEPHFS_ENOENT;
12793 }
12794 }
12795 }
12796
12797 return 0;
12798 }
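/*
 * Illustrative inputs (sketch): name = "layout.pool" with value =
 * "cephfs_data" fails the lexical_cast and is resolved via
 * lookup_pg_pool_name(); value = "3" parses as a pool id and is checked with
 * have_pg_pool(). Unknown pools return -CEPHFS_ENOENT either way.
 */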
12799
12800 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12801 {
12802 // For setting the pool of a layout, the MetaRequest needs an osdmap epoch.
12803 // There is a race where a newly created data pool is not yet known to the client or the MDS.
12804 // Having the client fetch the latest osdmap lets the MDS quickly judge whether it needs a newer one.
12805 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12806 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12807 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12808 string rest(strstr(name, "layout"));
12809 string v((const char*)value, size);
12810 int r = objecter->with_osdmap([&](const OSDMap& o) {
12811 return _setxattr_check_data_pool(rest, v, &o);
12812 });
12813
12814 if (r == -CEPHFS_ENOENT) {
12815 bs::error_code ec;
12816 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12817 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12818 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12819 }
12820 }
12821 }
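/*
 * Example of the race this handles (illustrative): a pool is created with
 * "ceph osd pool create" and immediately named in "ceph.dir.layout.pool";
 * the check above returns -CEPHFS_ENOENT against the cached osdmap, so we
 * block for the latest map once and then let the setxattr proceed.
 */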
12822
12823 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12824 size_t size, int flags, const UserPerm& perms)
12825 {
12826 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12827 if (!mref_reader.is_state_satisfied())
12828 return -CEPHFS_ENOTCONN;
12829
12830 _setxattr_maybe_wait_for_osdmap(name, value, size);
12831
12832 vinodeno_t vino = _get_vino(in);
12833
12834 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12835 tout(cct) << __func__ << std::endl;
12836 tout(cct) << vino.ino.val << std::endl;
12837 tout(cct) << name << std::endl;
12838
12839 std::scoped_lock lock(client_lock);
12840 if (!fuse_default_permissions) {
12841 int r = xattr_permission(in, name, MAY_WRITE, perms);
12842 if (r < 0)
12843 return r;
12844 }
12845 return _setxattr(in, name, value, size, flags, perms);
12846 }
12847
12848 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12849 {
12850 if (in->snapid != CEPH_NOSNAP) {
12851 return -CEPHFS_EROFS;
12852 }
12853
12854 // same xattr namespaces as are supported by the kernel client
12855 if (strncmp(name, "user.", 5) &&
12856 strncmp(name, "system.", 7) &&
12857 strncmp(name, "security.", 9) &&
12858 strncmp(name, "trusted.", 8) &&
12859 strncmp(name, "ceph.", 5))
12860 return -CEPHFS_EOPNOTSUPP;
12861
12862 const VXattr *vxattr = _match_vxattr(in, name);
12863 if (vxattr && vxattr->readonly)
12864 return -CEPHFS_EOPNOTSUPP;
12865
12866 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12867 filepath path;
12868 in->make_nosnap_relative_path(path);
12869 req->set_filepath(path);
12870 req->set_filepath2(name);
12871 req->set_inode(in);
12872
12873 int res = make_request(req, perms);
12874
12875 trim_cache();
12876 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12877 return res;
12878 }
12879
12880 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12881 {
12882 if (cct->_conf->client_permissions) {
12883 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12884 if (r < 0)
12885 return r;
12886 }
12887 return _removexattr(in.get(), name, perms);
12888 }
12889
12890 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12891 {
12892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12893 if (!mref_reader.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN;
12895
12896 vinodeno_t vino = _get_vino(in);
12897
12898 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12899 tout(cct) << "ll_removexattr" << std::endl;
12900 tout(cct) << vino.ino.val << std::endl;
12901 tout(cct) << name << std::endl;
12902
12903 std::scoped_lock lock(client_lock);
12904 if (!fuse_default_permissions) {
12905 int r = xattr_permission(in, name, MAY_WRITE, perms);
12906 if (r < 0)
12907 return r;
12908 }
12909
12910 return _removexattr(in, name, perms);
12911 }
12912
12913 bool Client::_vxattrcb_quota_exists(Inode *in)
12914 {
12915 return in->quota.is_enable() &&
12916 (in->snapid != CEPH_NOSNAP ||
12917 (in->snaprealm && in->snaprealm->ino == in->ino));
12918 }
12919 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12920 {
12921 return snprintf(val, size,
12922 "max_bytes=%lld max_files=%lld",
12923 (long long int)in->quota.max_bytes,
12924 (long long int)in->quota.max_files);
12925 }
12926 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12927 {
12928 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12929 }
12930 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12931 {
12932 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12933 }
12934
12935 bool Client::_vxattrcb_layout_exists(Inode *in)
12936 {
12937 return in->layout != file_layout_t();
12938 }
12939 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12940 {
12941 int r = snprintf(val, size,
12942 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12943 (unsigned long long)in->layout.stripe_unit,
12944 (unsigned long long)in->layout.stripe_count,
12945 (unsigned long long)in->layout.object_size);
12946 objecter->with_osdmap([&](const OSDMap& o) {
12947 if (o.have_pg_pool(in->layout.pool_id))
12948 r += snprintf(val + r, size - r, "%s",
12949 o.get_pool_name(in->layout.pool_id).c_str());
12950 else
12951 r += snprintf(val + r, size - r, "%" PRIu64,
12952 (uint64_t)in->layout.pool_id);
12953 });
12954 if (in->layout.pool_ns.length())
12955 r += snprintf(val + r, size - r, " pool_namespace=%s",
12956 in->layout.pool_ns.c_str());
12957 return r;
12958 }
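// Sample output for a default layout on a pool named "cephfs_data" (pool
// name assumed for illustration; an unknown pool id is printed numerically):
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"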
12959 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12960 {
12961 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12962 }
12963 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12964 {
12965 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12966 }
12967 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12968 {
12969 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12970 }
12971 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12972 {
12973 size_t r;
12974 objecter->with_osdmap([&](const OSDMap& o) {
12975 if (o.have_pg_pool(in->layout.pool_id))
12976 r = snprintf(val, size, "%s", o.get_pool_name(
12977 in->layout.pool_id).c_str());
12978 else
12979 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12980 });
12981 return r;
12982 }
12983 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12984 {
12985 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12986 }
12987 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12988 {
12989 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12990 }
12991 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12992 {
12993 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12994 }
12995 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12996 {
12997 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12998 }
12999 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
13000 {
13001 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
13002 }
13003 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
13004 {
13005 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
13006 }
13007 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
13008 {
13009 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
13010 }
13011 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
13012 {
13013 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
13014 }
13015 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
13016 {
13017 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
13018 }
13019 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
13020 {
13021 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
13022 (long)in->rstat.rctime.nsec());
13023 }
13024 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13025 {
13026 return in->dir_pin != -CEPHFS_ENODATA;
13027 }
13028 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13029 {
13030 return snprintf(val, size, "%ld", (long)in->dir_pin);
13031 }
13032
13033 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13034 {
13035 return !in->snap_btime.is_zero();
13036 }
13037
13038 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13039 {
13040 return snprintf(val, size, "%llu.%09lu",
13041 (long long unsigned)in->snap_btime.sec(),
13042 (long unsigned)in->snap_btime.nsec());
13043 }
13044
13045 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13046 {
13047 int issued;
13048
13049 in->caps_issued(&issued);
13050 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13051 }
13052
13053 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13054 {
13055 // checking one of the xattrs would suffice
13056 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13057 }
13058
13059 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
13060 {
13061 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
13062 in->xattrs["ceph.mirror.info.cluster_id"].length(),
13063 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
13064 in->xattrs["ceph.mirror.info.fs_id"].length(),
13065 in->xattrs["ceph.mirror.info.fs_id"].c_str());
13066 }
13067
13068 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13069 {
13070 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13071 }
13072
13073 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13074 {
13075 auto name = messenger->get_myname();
13076 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
13077 }
13078
13079 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13080 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13081
13082 #define XATTR_NAME_CEPH(_type, _name, _flags) \
13083 { \
13084 name: CEPH_XATTR_NAME(_type, _name), \
13085 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13086 readonly: true, \
13087 exists_cb: NULL, \
13088 flags: _flags, \
13089 }
13090 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13091 { \
13092 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13093 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13094 readonly: false, \
13095 exists_cb: &Client::_vxattrcb_layout_exists, \
13096 flags: 0, \
13097 }
13098 #define XATTR_QUOTA_FIELD(_type, _name) \
13099 { \
13100 name: CEPH_XATTR_NAME(_type, _name), \
13101 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13102 readonly: false, \
13103 exists_cb: &Client::_vxattrcb_quota_exists, \
13104 flags: 0, \
13105 }
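// For illustration, XATTR_QUOTA_FIELD(quota, max_bytes) expands to:
//
//   {
//     name: "ceph.quota.max_bytes",
//     getxattr_cb: &Client::_vxattrcb_quota_max_bytes,
//     readonly: false,
//     exists_cb: &Client::_vxattrcb_quota_exists,
//     flags: 0,
//   }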
13106
13107 const Client::VXattr Client::_dir_vxattrs[] = {
13108 {
13109 name: "ceph.dir.layout",
13110 getxattr_cb: &Client::_vxattrcb_layout,
13111 readonly: false,
13112 exists_cb: &Client::_vxattrcb_layout_exists,
13113 flags: 0,
13114 },
13115 // FIXME
13116 // Delete the following dir layout field definitions for release "S"
13117 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
13118 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
13119 XATTR_LAYOUT_FIELD(dir, layout, object_size),
13120 XATTR_LAYOUT_FIELD(dir, layout, pool),
13121 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
13122 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
13123 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
13124 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
13125 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
13126 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
13127 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
13128 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
13129 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
13130 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
13131 {
13132 name: "ceph.quota",
13133 getxattr_cb: &Client::_vxattrcb_quota,
13134 readonly: false,
13135 exists_cb: &Client::_vxattrcb_quota_exists,
13136 flags: 0,
13137 },
13138 XATTR_QUOTA_FIELD(quota, max_bytes),
13139 XATTR_QUOTA_FIELD(quota, max_files),
13140 // FIXME
13141 // Delete the following dir pin field definitions for release "S"
13142 {
13143 name: "ceph.dir.pin",
13144 getxattr_cb: &Client::_vxattrcb_dir_pin,
13145 readonly: false,
13146 exists_cb: &Client::_vxattrcb_dir_pin_exists,
13147 flags: 0,
13148 },
13149 {
13150 name: "ceph.snap.btime",
13151 getxattr_cb: &Client::_vxattrcb_snap_btime,
13152 readonly: true,
13153 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13154 flags: 0,
13155 },
13156 {
13157 name: "ceph.mirror.info",
13158 getxattr_cb: &Client::_vxattrcb_mirror_info,
13159 readonly: false,
13160 exists_cb: &Client::_vxattrcb_mirror_info_exists,
13161 flags: 0,
13162 },
13163 {
13164 name: "ceph.caps",
13165 getxattr_cb: &Client::_vxattrcb_caps,
13166 readonly: true,
13167 exists_cb: NULL,
13168 flags: 0,
13169 },
13170 { name: "" } /* Required table terminator */
13171 };
13172
13173 const Client::VXattr Client::_file_vxattrs[] = {
13174 {
13175 name: "ceph.file.layout",
13176 getxattr_cb: &Client::_vxattrcb_layout,
13177 readonly: false,
13178 exists_cb: &Client::_vxattrcb_layout_exists,
13179 flags: 0,
13180 },
13181 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
13182 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
13183 XATTR_LAYOUT_FIELD(file, layout, object_size),
13184 XATTR_LAYOUT_FIELD(file, layout, pool),
13185 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
13186 {
13187 name: "ceph.snap.btime",
13188 getxattr_cb: &Client::_vxattrcb_snap_btime,
13189 readonly: true,
13190 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13191 flags: 0,
13192 },
13193 {
13194 name: "ceph.caps",
13195 getxattr_cb: &Client::_vxattrcb_caps,
13196 readonly: true,
13197 exists_cb: NULL,
13198 flags: 0,
13199 },
13200 { name: "" } /* Required table terminator */
13201 };
13202
13203 const Client::VXattr Client::_common_vxattrs[] = {
13204 {
13205 name: "ceph.cluster_fsid",
13206 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
13207 readonly: true,
13208 exists_cb: nullptr,
13209 flags: 0,
13210 },
13211 {
13212 name: "ceph.client_id",
13213 getxattr_cb: &Client::_vxattrcb_client_id,
13214 readonly: true,
13215 exists_cb: nullptr,
13216 flags: 0,
13217 },
13218 { name: "" } /* Required table terminator */
13219 };
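// These virtual xattrs are visible to ordinary xattr tooling; on a mounted
// filesystem they would typically be queried with something like
// (mount point paths illustrative):
//
//   getfattr -n ceph.dir.rbytes /mnt/cephfs/some/dir
//   getfattr -n ceph.cluster_fsid /mnt/cephfs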
13220
13221 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13222 {
13223 if (in->is_dir())
13224 return _dir_vxattrs;
13225 else if (in->is_file())
13226 return _file_vxattrs;
13227 return NULL;
13228 }
13229
13230 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13231 {
13232 if (strncmp(name, "ceph.", 5) == 0) {
13233 const VXattr *vxattr = _get_vxattrs(in);
13234 if (vxattr) {
13235 while (!vxattr->name.empty()) {
13236 if (vxattr->name == name)
13237 return vxattr;
13238 vxattr++;
13239 }
13240 }
13241
13242 // for common vxattrs
13243 vxattr = _common_vxattrs;
13244 while (!vxattr->name.empty()) {
13245 if (vxattr->name == name)
13246 return vxattr;
13247 vxattr++;
13248 }
13249 }
13250
13251 return NULL;
13252 }
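// Typical caller pattern (cf. _removexattr() above): look the name up and
// honor the readonly flag before issuing an MDS request, e.g.
//
//   const VXattr *vxattr = _match_vxattr(in, "ceph.dir.pin");
//   if (vxattr && vxattr->readonly)
//     return -CEPHFS_EOPNOTSUPP;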
13253
13254 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
13255 {
13256 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13257 if (!mref_reader.is_state_satisfied())
13258 return -CEPHFS_ENOTCONN;
13259
13260 vinodeno_t vino = _get_vino(in);
13261
13262 ldout(cct, 3) << "ll_readlink " << vino << dendl;
13263 tout(cct) << "ll_readlink" << std::endl;
13264 tout(cct) << vino.ino.val << std::endl;
13265
13266 std::scoped_lock lock(client_lock);
13267 for (auto dn : in->dentries) {
13268 touch_dn(dn);
13269 }
13270
13271 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
13272 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
13273 return r;
13274 }
13275
13276 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
13277 const UserPerm& perms, InodeRef *inp)
13278 {
13279 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
13280 << mode << dec << ", " << rdev << ", uid " << perms.uid()
13281 << ", gid " << perms.gid() << ")" << dendl;
13282
13283 if (strlen(name) > NAME_MAX)
13284 return -CEPHFS_ENAMETOOLONG;
13285
13286 if (dir->snapid != CEPH_NOSNAP) {
13287 return -CEPHFS_EROFS;
13288 }
13289 if (is_quota_files_exceeded(dir, perms)) {
13290 return -CEPHFS_EDQUOT;
13291 }
13292
13293 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
13294
13295 filepath path;
13296 dir->make_nosnap_relative_path(path);
13297 path.push_dentry(name);
13298 req->set_filepath(path);
13299 req->set_inode(dir);
13300 req->head.args.mknod.rdev = rdev;
13301 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13302 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13303
13304 bufferlist xattrs_bl;
13305 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13306 if (res < 0)
13307 goto fail;
13308 req->head.args.mknod.mode = mode;
13309 if (xattrs_bl.length() > 0)
13310 req->set_data(xattrs_bl);
13311
13312 Dentry *de;
13313 res = get_or_create(dir, name, &de);
13314 if (res < 0)
13315 goto fail;
13316 req->set_dentry(de);
13317
13318 res = make_request(req, perms, inp);
13319
13320 trim_cache();
13321
13322 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13323 return res;
13324
13325 fail:
13326 put_request(req);
13327 return res;
13328 }
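// _mknod() follows the same request pattern used by the other namespace
// operations below: build a MetaRequest for the op, attach the
// parent-relative filepath and a Dentry from get_or_create(), send it with
// make_request(), then trim_cache(). Roughly:
//
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_...);
//   req->set_filepath(path);   // parent dir + new dentry name
//   req->set_dentry(de);       // from get_or_create(dir, name, &de)
//   res = make_request(req, perms, inp);
//   trim_cache();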
13329
13330 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13331 dev_t rdev, struct stat *attr, Inode **out,
13332 const UserPerm& perms)
13333 {
13334 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13335 if (!mref_reader.is_state_satisfied())
13336 return -CEPHFS_ENOTCONN;
13337
13338 vinodeno_t vparent = _get_vino(parent);
13339
13340 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13341 tout(cct) << "ll_mknod" << std::endl;
13342 tout(cct) << vparent.ino.val << std::endl;
13343 tout(cct) << name << std::endl;
13344 tout(cct) << mode << std::endl;
13345 tout(cct) << rdev << std::endl;
13346
13347 std::scoped_lock lock(client_lock);
13348 if (!fuse_default_permissions) {
13349 int r = may_create(parent, perms);
13350 if (r < 0)
13351 return r;
13352 }
13353
13354 InodeRef in;
13355 int r = _mknod(parent, name, mode, rdev, perms, &in);
13356 if (r == 0) {
13357 fill_stat(in, attr);
13358 _ll_get(in.get());
13359 }
13360 tout(cct) << attr->st_ino << std::endl;
13361 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13362 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13363 *out = in.get();
13364 return r;
13365 }
13366
13367 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13368 dev_t rdev, Inode **out,
13369 struct ceph_statx *stx, unsigned want, unsigned flags,
13370 const UserPerm& perms)
13371 {
13372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13373 if (!mref_reader.is_state_satisfied())
13374 return -CEPHFS_ENOTCONN;
13375
13376 unsigned caps = statx_to_mask(flags, want);
13377
13378 vinodeno_t vparent = _get_vino(parent);
13379
13380 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13381 tout(cct) << "ll_mknodx" << std::endl;
13382 tout(cct) << vparent.ino.val << std::endl;
13383 tout(cct) << name << std::endl;
13384 tout(cct) << mode << std::endl;
13385 tout(cct) << rdev << std::endl;
13386
13387 std::scoped_lock lock(client_lock);
13388
13389 if (!fuse_default_permissions) {
13390 int r = may_create(parent, perms);
13391 if (r < 0)
13392 return r;
13393 }
13394
13395 InodeRef in;
13396 int r = _mknod(parent, name, mode, rdev, perms, &in);
13397 if (r == 0) {
13398 fill_statx(in, caps, stx);
13399 _ll_get(in.get());
13400 }
13401 tout(cct) << stx->stx_ino << std::endl;
13402 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13403 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13404 *out = in.get();
13405 return r;
13406 }
13407
13408 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
13409 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
13410 int object_size, const char *data_pool, bool *created,
13411 const UserPerm& perms, std::string alternate_name)
13412 {
13413 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
13414 mode << dec << ")" << dendl;
13415
13416 if (strlen(name) > NAME_MAX)
13417 return -CEPHFS_ENAMETOOLONG;
13418 if (dir->snapid != CEPH_NOSNAP) {
13419 return -CEPHFS_EROFS;
13420 }
13421 if (is_quota_files_exceeded(dir, perms)) {
13422 return -CEPHFS_EDQUOT;
13423 }
13424
13425 // use normalized flags to generate cmode
13426 int cflags = ceph_flags_sys2wire(flags);
13427 if (cct->_conf.get_val<bool>("client_force_lazyio"))
13428 cflags |= CEPH_O_LAZY;
13429
13430 int cmode = ceph_flags_to_mode(cflags);
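  // ceph_flags_sys2wire() converts host open(2) flags to the wire encoding,
  // and ceph_flags_to_mode() reduces them to a cap-oriented file mode; e.g.
  // an O_RDWR open normally yields CEPH_FILE_MODE_RDWR, which is what
  // get_open_ref()/_create_fh() below count open references against.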
13431
13432 int64_t pool_id = -1;
13433 if (data_pool && *data_pool) {
13434 pool_id = objecter->with_osdmap(
13435 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
13436 if (pool_id < 0)
13437 return -CEPHFS_EINVAL;
13438 if (pool_id > 0xffffffffll)
13439 return -CEPHFS_ERANGE; // bummer!
13440 }
13441
13442 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
13443
13444 filepath path;
13445 dir->make_nosnap_relative_path(path);
13446 path.push_dentry(name);
13447 req->set_filepath(path);
13448 req->set_alternate_name(std::move(alternate_name));
13449 req->set_inode(dir);
13450 req->head.args.open.flags = cflags | CEPH_O_CREAT;
13451
13452 req->head.args.open.stripe_unit = stripe_unit;
13453 req->head.args.open.stripe_count = stripe_count;
13454 req->head.args.open.object_size = object_size;
13455 if (cct->_conf->client_debug_getattr_caps)
13456 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
13457 else
13458 req->head.args.open.mask = 0;
13459 req->head.args.open.pool = pool_id;
13460 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13461 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13462
13463 mode |= S_IFREG;
13464 bufferlist xattrs_bl;
13465 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
13466 if (res < 0)
13467 goto fail;
13468 req->head.args.open.mode = mode;
13469 if (xattrs_bl.length() > 0)
13470 req->set_data(xattrs_bl);
13471
13472 Dentry *de;
13473 res = get_or_create(dir, name, &de);
13474 if (res < 0)
13475 goto fail;
13476 req->set_dentry(de);
13477
13478 res = make_request(req, perms, inp, created);
13479 if (res < 0) {
13480 goto reply_error;
13481 }
13482
13483 /* If the caller passed a value in fhp, do the open */
13484 if (fhp) {
13485 (*inp)->get_open_ref(cmode);
13486 *fhp = _create_fh(inp->get(), flags, cmode, perms);
13487 }
13488
13489 reply_error:
13490 trim_cache();
13491
13492 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
13493 << " layout " << stripe_unit
13494 << ' ' << stripe_count
13495 << ' ' << object_size
13496 <<") = " << res << dendl;
13497 return res;
13498
13499 fail:
13500 put_request(req);
13501 return res;
13502 }
13503
13504 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
13505 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13506 std::string alternate_name)
13507 {
13508 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
13509 << mode << dec << ", uid " << perm.uid()
13510 << ", gid " << perm.gid() << ")" << dendl;
13511
13512 if (strlen(name) > NAME_MAX)
13513 return -CEPHFS_ENAMETOOLONG;
13514
13515 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13516 return -CEPHFS_EROFS;
13517 }
13518 if (is_quota_files_exceeded(dir, perm)) {
13519 return -CEPHFS_EDQUOT;
13520 }
13521
13522 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13523 MetaRequest *req = new MetaRequest(is_snap_op ?
13524 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
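  // A mkdir inside a directory's ".snap" pseudo-directory is how snapshots
  // are created, so when dir->snapid == CEPH_SNAPDIR this issues a
  // CEPH_MDS_OP_MKSNAP instead; e.g. "mkdir /mnt/cephfs/dir/.snap/mysnap"
  // (path illustrative) ends up here as a snap op.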
13525
13526 filepath path;
13527 dir->make_nosnap_relative_path(path);
13528 path.push_dentry(name);
13529 req->set_filepath(path);
13530 req->set_inode(dir);
13531 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13532 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13533 req->set_alternate_name(std::move(alternate_name));
13534
13535 mode |= S_IFDIR;
13536 bufferlist bl;
13537 int res = _posix_acl_create(dir, &mode, bl, perm);
13538 if (res < 0)
13539 goto fail;
13540 req->head.args.mkdir.mode = mode;
13541 if (is_snap_op) {
13542 SnapPayload payload;
13543 // clear the bufferlist that may have been populated by the call
13544 // to _posix_acl_create(). MDS mksnap does not make use of it.
13545 // So, reuse it to pass metadata payload.
13546 bl.clear();
13547 payload.metadata = metadata;
13548 encode(payload, bl);
13549 }
13550 if (bl.length() > 0) {
13551 req->set_data(bl);
13552 }
13553
13554 Dentry *de;
13555 res = get_or_create(dir, name, &de);
13556 if (res < 0)
13557 goto fail;
13558 req->set_dentry(de);
13559
13560 ldout(cct, 10) << "_mkdir: making request" << dendl;
13561 res = make_request(req, perm, inp);
13562 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13563
13564 trim_cache();
13565
13566 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13567 return res;
13568
13569 fail:
13570 put_request(req);
13571 return res;
13572 }
13573
13574 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13575 struct stat *attr, Inode **out, const UserPerm& perm)
13576 {
13577 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13578 if (!mref_reader.is_state_satisfied())
13579 return -CEPHFS_ENOTCONN;
13580
13581 vinodeno_t vparent = _get_vino(parent);
13582
13583 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13584 tout(cct) << "ll_mkdir" << std::endl;
13585 tout(cct) << vparent.ino.val << std::endl;
13586 tout(cct) << name << std::endl;
13587 tout(cct) << mode << std::endl;
13588
13589 std::scoped_lock lock(client_lock);
13590
13591 if (!fuse_default_permissions) {
13592 int r = may_create(parent, perm);
13593 if (r < 0)
13594 return r;
13595 }
13596
13597 InodeRef in;
13598 int r = _mkdir(parent, name, mode, perm, &in);
13599 if (r == 0) {
13600 fill_stat(in, attr);
13601 _ll_get(in.get());
13602 }
13603 tout(cct) << attr->st_ino << std::endl;
13604 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13605 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13606 *out = in.get();
13607 return r;
13608 }
13609
13610 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13611 struct ceph_statx *stx, unsigned want, unsigned flags,
13612 const UserPerm& perms)
13613 {
13614 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13615 if (!mref_reader.is_state_satisfied())
13616 return -CEPHFS_ENOTCONN;
13617
13618 vinodeno_t vparent = _get_vino(parent);
13619
13620 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13621 tout(cct) << "ll_mkdirx" << std::endl;
13622 tout(cct) << vparent.ino.val << std::endl;
13623 tout(cct) << name << std::endl;
13624 tout(cct) << mode << std::endl;
13625
13626 std::scoped_lock lock(client_lock);
13627
13628 if (!fuse_default_permissions) {
13629 int r = may_create(parent, perms);
13630 if (r < 0)
13631 return r;
13632 }
13633
13634 InodeRef in;
13635 int r = _mkdir(parent, name, mode, perms, &in);
13636 if (r == 0) {
13637 fill_statx(in, statx_to_mask(flags, want), stx);
13638 _ll_get(in.get());
13639 } else {
13640 stx->stx_ino = 0;
13641 stx->stx_mask = 0;
13642 }
13643 tout(cct) << stx->stx_ino << std::endl;
13644 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13645 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13646 *out = in.get();
13647 return r;
13648 }
13649
13650 int Client::_symlink(Inode *dir, const char *name, const char *target,
13651 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13652 {
13653 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13654 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13655 << dendl;
13656
13657 if (strlen(name) > NAME_MAX)
13658 return -CEPHFS_ENAMETOOLONG;
13659
13660 if (dir->snapid != CEPH_NOSNAP) {
13661 return -CEPHFS_EROFS;
13662 }
13663 if (is_quota_files_exceeded(dir, perms)) {
13664 return -CEPHFS_EDQUOT;
13665 }
13666
13667 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13668
13669 filepath path;
13670 dir->make_nosnap_relative_path(path);
13671 path.push_dentry(name);
13672 req->set_filepath(path);
13673 req->set_alternate_name(std::move(alternate_name));
13674 req->set_inode(dir);
13675 req->set_string2(target);
13676 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13677 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13678
13679 Dentry *de;
13680 int res = get_or_create(dir, name, &de);
13681 if (res < 0)
13682 goto fail;
13683 req->set_dentry(de);
13684
13685 res = make_request(req, perms, inp);
13686
13687 trim_cache();
13688 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13689 res << dendl;
13690 return res;
13691
13692 fail:
13693 put_request(req);
13694 return res;
13695 }
13696
13697 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13698 struct stat *attr, Inode **out, const UserPerm& perms)
13699 {
13700 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13701 if (!mref_reader.is_state_satisfied())
13702 return -CEPHFS_ENOTCONN;
13703
13704 vinodeno_t vparent = _get_vino(parent);
13705
13706 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13707 << dendl;
13708 tout(cct) << "ll_symlink" << std::endl;
13709 tout(cct) << vparent.ino.val << std::endl;
13710 tout(cct) << name << std::endl;
13711 tout(cct) << value << std::endl;
13712
13713 std::scoped_lock lock(client_lock);
13714
13715 if (!fuse_default_permissions) {
13716 int r = may_create(parent, perms);
13717 if (r < 0)
13718 return r;
13719 }
13720
13721 InodeRef in;
13722 int r = _symlink(parent, name, value, perms, "", &in);
13723 if (r == 0) {
13724 fill_stat(in, attr);
13725 _ll_get(in.get());
13726 }
13727 tout(cct) << attr->st_ino << std::endl;
13728 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13729 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13730 *out = in.get();
13731 return r;
13732 }
13733
13734 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13735 Inode **out, struct ceph_statx *stx, unsigned want,
13736 unsigned flags, const UserPerm& perms)
13737 {
13738 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13739 if (!mref_reader.is_state_satisfied())
13740 return -CEPHFS_ENOTCONN;
13741
13742 vinodeno_t vparent = _get_vino(parent);
13743
13744 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13745 << dendl;
13746 tout(cct) << "ll_symlinkx" << std::endl;
13747 tout(cct) << vparent.ino.val << std::endl;
13748 tout(cct) << name << std::endl;
13749 tout(cct) << value << std::endl;
13750
13751 std::scoped_lock lock(client_lock);
13752
13753 if (!fuse_default_permissions) {
13754 int r = may_create(parent, perms);
13755 if (r < 0)
13756 return r;
13757 }
13758
13759 InodeRef in;
13760 int r = _symlink(parent, name, value, perms, "", &in);
13761 if (r == 0) {
13762 fill_statx(in, statx_to_mask(flags, want), stx);
13763 _ll_get(in.get());
13764 }
13765 tout(cct) << stx->stx_ino << std::endl;
13766 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13767 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13768 *out = in.get();
13769 return r;
13770 }
13771
13772 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13773 {
13774 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13775 << " uid " << perm.uid() << " gid " << perm.gid()
13776 << ")" << dendl;
13777
13778 if (dir->snapid != CEPH_NOSNAP) {
13779 return -CEPHFS_EROFS;
13780 }
13781
13782 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13783
13784 filepath path;
13785 dir->make_nosnap_relative_path(path);
13786 path.push_dentry(name);
13787 req->set_filepath(path);
13788
13789 InodeRef otherin;
13790 Inode *in;
13791 Dentry *de;
13792
13793 int res = get_or_create(dir, name, &de);
13794 if (res < 0)
13795 goto fail;
13796 req->set_dentry(de);
13797 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13798 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13799
13800 res = _lookup(dir, name, 0, &otherin, perm);
13801 if (res < 0)
13802 goto fail;
13803
13804 in = otherin.get();
13805 req->set_other_inode(in);
13806 in->break_all_delegs();
13807 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13808
13809 req->set_inode(dir);
13810
13811 res = make_request(req, perm);
13812
13813 trim_cache();
13814 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13815 return res;
13816
13817 fail:
13818 put_request(req);
13819 return res;
13820 }
13821
13822 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13823 {
13824 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13825 if (!mref_reader.is_state_satisfied())
13826 return -CEPHFS_ENOTCONN;
13827
13828 vinodeno_t vino = _get_vino(in);
13829
13830 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13831 tout(cct) << "ll_unlink" << std::endl;
13832 tout(cct) << vino.ino.val << std::endl;
13833 tout(cct) << name << std::endl;
13834
13835 std::scoped_lock lock(client_lock);
13836
13837 if (!fuse_default_permissions) {
13838 int r = may_delete(in, name, perm);
13839 if (r < 0)
13840 return r;
13841 }
13842 return _unlink(in, name, perm);
13843 }
13844
13845 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13846 {
13847 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13848 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13849
13850 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13851 return -CEPHFS_EROFS;
13852 }
13853
13854 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13855 MetaRequest *req = new MetaRequest(op);
13856 filepath path;
13857 dir->make_nosnap_relative_path(path);
13858 path.push_dentry(name);
13859 req->set_filepath(path);
13860 req->set_inode(dir);
13861
13862 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13863 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13864 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13865
13866 InodeRef in;
13867
13868 Dentry *de;
13869 int res = get_or_create(dir, name, &de);
13870 if (res < 0)
13871 goto fail;
13872 if (op == CEPH_MDS_OP_RMDIR)
13873 req->set_dentry(de);
13874 else
13875 de->get();
13876
13877 res = _lookup(dir, name, 0, &in, perms);
13878 if (res < 0)
13879 goto fail;
13880
13881 if (op == CEPH_MDS_OP_RMSNAP) {
13882 unlink(de, true, true);
13883 de->put();
13884 }
13885 req->set_other_inode(in.get());
13886
13887 res = make_request(req, perms);
13888
13889 trim_cache();
13890 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13891 return res;
13892
13893 fail:
13894 put_request(req);
13895 return res;
13896 }
13897
13898 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13899 {
13900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13901 if (!mref_reader.is_state_satisfied())
13902 return -CEPHFS_ENOTCONN;
13903
13904 vinodeno_t vino = _get_vino(in);
13905
13906 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13907 tout(cct) << "ll_rmdir" << std::endl;
13908 tout(cct) << vino.ino.val << std::endl;
13909 tout(cct) << name << std::endl;
13910
13911 std::scoped_lock lock(client_lock);
13912
13913 if (!fuse_default_permissions) {
13914 int r = may_delete(in, name, perms);
13915 if (r < 0)
13916 return r;
13917 }
13918
13919 return _rmdir(in, name, perms);
13920 }
13921
13922 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13923 {
13924 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13925 << todir->ino << " " << toname
13926 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13927 << dendl;
13928
13929 if (fromdir->snapid != todir->snapid)
13930 return -CEPHFS_EXDEV;
13931
13932 int op = CEPH_MDS_OP_RENAME;
13933 if (fromdir->snapid != CEPH_NOSNAP) {
13934 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13935 op = CEPH_MDS_OP_RENAMESNAP;
13936 else
13937 return -CEPHFS_EROFS;
13938 }
13939 if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
13940 Inode *fromdir_root =
13941 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13942 Inode *todir_root =
13943 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13944 if (fromdir_root != todir_root) {
13945 return -CEPHFS_EXDEV;
13946 }
13947 }
13948
13949 InodeRef target;
13950 MetaRequest *req = new MetaRequest(op);
13951
13952 filepath from;
13953 fromdir->make_nosnap_relative_path(from);
13954 from.push_dentry(fromname);
13955 filepath to;
13956 todir->make_nosnap_relative_path(to);
13957 to.push_dentry(toname);
13958 req->set_filepath(to);
13959 req->set_filepath2(from);
13960 req->set_alternate_name(std::move(alternate_name));
13961
13962 Dentry *oldde;
13963 int res = get_or_create(fromdir, fromname, &oldde);
13964 if (res < 0)
13965 goto fail;
13966 Dentry *de;
13967 res = get_or_create(todir, toname, &de);
13968 if (res < 0)
13969 goto fail;
13970
13971 if (op == CEPH_MDS_OP_RENAME) {
13972 req->set_old_dentry(oldde);
13973 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13974 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13975
13976 req->set_dentry(de);
13977 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13978 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13979
13980 InodeRef oldin, otherin;
13981 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13982 if (res < 0)
13983 goto fail;
13984
13985 Inode *oldinode = oldin.get();
13986 oldinode->break_all_delegs();
13987 req->set_old_inode(oldinode);
13988 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13989
13990 res = _lookup(todir, toname, 0, &otherin, perm);
13991 switch (res) {
13992 case 0:
13993 {
13994 Inode *in = otherin.get();
13995 req->set_other_inode(in);
13996 in->break_all_delegs();
13997 }
13998 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13999 break;
14000 case -CEPHFS_ENOENT:
14001 break;
14002 default:
14003 goto fail;
14004 }
14005
14006 req->set_inode(todir);
14007 } else {
14008 // renamesnap reply contains no tracedn, so we need to invalidate
14009 // dentry manually
14010 unlink(oldde, true, true);
14011 unlink(de, true, true);
14012
14013 req->set_inode(todir);
14014 }
14015
14016 res = make_request(req, perm, &target);
14017 ldout(cct, 10) << "rename result is " << res << dendl;
14018
14019 // the renamed item is updated in our cache via the reply trace
14020
14021 trim_cache();
14022 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
14023 return res;
14024
14025 fail:
14026 put_request(req);
14027 return res;
14028 }
14029
14030 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14031 const char *newname, const UserPerm& perm)
14032 {
14033 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14034 if (!mref_reader.is_state_satisfied())
14035 return -CEPHFS_ENOTCONN;
14036
14037 vinodeno_t vparent = _get_vino(parent);
14038 vinodeno_t vnewparent = _get_vino(newparent);
14039
14040 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14041 << vnewparent << " " << newname << dendl;
14042 tout(cct) << "ll_rename" << std::endl;
14043 tout(cct) << vparent.ino.val << std::endl;
14044 tout(cct) << name << std::endl;
14045 tout(cct) << vnewparent.ino.val << std::endl;
14046 tout(cct) << newname << std::endl;
14047
14048 std::scoped_lock lock(client_lock);
14049
14050 if (!fuse_default_permissions) {
14051 int r = may_delete(parent, name, perm);
14052 if (r < 0)
14053 return r;
14054 r = may_delete(newparent, newname, perm);
14055 if (r < 0 && r != -CEPHFS_ENOENT)
14056 return r;
14057 }
14058
14059 return _rename(parent, name, newparent, newname, perm, "");
14060 }
14061
14062 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
14063 {
14064 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
14065 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
14066
14067 if (strlen(newname) > NAME_MAX)
14068 return -CEPHFS_ENAMETOOLONG;
14069
14070 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
14071 return -CEPHFS_EROFS;
14072 }
14073 if (is_quota_files_exceeded(dir, perm)) {
14074 return -CEPHFS_EDQUOT;
14075 }
14076
14077 in->break_all_delegs();
14078 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
14079
14080 filepath path(newname, dir->ino);
14081 req->set_filepath(path);
14082 req->set_alternate_name(std::move(alternate_name));
14083 filepath existing(in->ino);
14084 req->set_filepath2(existing);
14085
14086 req->set_inode(dir);
14087 req->inode_drop = CEPH_CAP_FILE_SHARED;
14088 req->inode_unless = CEPH_CAP_FILE_EXCL;
14089
14090 Dentry *de;
14091 int res = get_or_create(dir, newname, &de);
14092 if (res < 0)
14093 goto fail;
14094 req->set_dentry(de);
14095
14096 res = make_request(req, perm, inp);
14097 ldout(cct, 10) << "link result is " << res << dendl;
14098
14099 trim_cache();
14100 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
14101 return res;
14102
14103 fail:
14104 put_request(req);
14105 return res;
14106 }
14107
14108 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14109 const UserPerm& perm)
14110 {
14111 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14112 if (!mref_reader.is_state_satisfied())
14113 return -CEPHFS_ENOTCONN;
14114
14115 vinodeno_t vino = _get_vino(in);
14116 vinodeno_t vnewparent = _get_vino(newparent);
14117
14118 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
14119 newname << dendl;
14120 tout(cct) << "ll_link" << std::endl;
14121 tout(cct) << vino.ino.val << std::endl;
14122 tout(cct) << vnewparent << std::endl;
14123 tout(cct) << newname << std::endl;
14124
14125 InodeRef target;
14126
14127 std::scoped_lock lock(client_lock);
14128
14129 if (!fuse_default_permissions) {
14130 if (S_ISDIR(in->mode))
14131 return -CEPHFS_EPERM;
14132
14133 int r = may_hardlink(in, perm);
14134 if (r < 0)
14135 return r;
14136
14137 r = may_create(newparent, perm);
14138 if (r < 0)
14139 return r;
14140 }
14141
14142 return _link(in, newparent, newname, perm, "", &target);
14143 }
14144
14145 int Client::ll_num_osds(void)
14146 {
14147 std::scoped_lock lock(client_lock);
14148 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14149 }
14150
14151 int Client::ll_osdaddr(int osd, uint32_t *addr)
14152 {
14153 std::scoped_lock lock(client_lock);
14154
14155 entity_addr_t g;
14156 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14157 if (!o.exists(osd))
14158 return false;
14159 g = o.get_addrs(osd).front();
14160 return true;
14161 });
14162 if (!exists)
14163 return -1;
14164 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14165 *addr = ntohl(nb_addr);
14166 return 0;
14167 }
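// ll_osdaddr() returns the OSD's IPv4 address in *host* byte order; a caller
// wanting a dotted quad would convert back, e.g. (sketch, needs
// <arpa/inet.h>):
//
//   uint32_t addr;
//   if (client->ll_osdaddr(osd, &addr) == 0) {
//     struct in_addr a;
//     a.s_addr = htonl(addr);   // back to network byte order
//     printf("osd.%d at %s\n", osd, inet_ntoa(a));
//   }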
14168
14169 uint32_t Client::ll_stripe_unit(Inode *in)
14170 {
14171 std::scoped_lock lock(client_lock);
14172 return in->layout.stripe_unit;
14173 }
14174
14175 uint64_t Client::ll_snap_seq(Inode *in)
14176 {
14177 std::scoped_lock lock(client_lock);
14178 return in->snaprealm->seq;
14179 }
14180
14181 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14182 {
14183 std::scoped_lock lock(client_lock);
14184 *layout = in->layout;
14185 return 0;
14186 }
14187
14188 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14189 {
14190 return ll_file_layout(fh->inode.get(), layout);
14191 }
14192
14193 /* Currently we cannot take advantage of redundancy in reads, since we
14194 would have to go through all possible placement groups (a
14195 potentially quite large number determined by a hash), and use CRUSH
14196 to calculate the appropriate set of OSDs for each placement group,
14197 then index into that. An array with one entry per OSD is much more
14198 tractable and works for demonstration purposes. */
14199
14200 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
14201 file_layout_t* layout)
14202 {
14203 std::scoped_lock lock(client_lock);
14204
14205 inodeno_t ino = in->ino;
14206 uint32_t object_size = layout->object_size;
14207 uint32_t su = layout->stripe_unit;
14208 uint32_t stripe_count = layout->stripe_count;
14209 uint64_t stripes_per_object = object_size / su;
14210 uint64_t stripeno = 0, stripepos = 0;
14211
14212 if (stripe_count) {
14213 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
14214 stripepos = blockno % stripe_count; // which object in the object set (X)
14215 }
14216 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
14217 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
14218
14219 object_t oid = file_object_t(ino, objectno);
14220 return objecter->with_osdmap([&](const OSDMap& o) {
14221 ceph_object_layout olayout =
14222 o.file_to_object_layout(oid, *layout);
14223 pg_t pg = (pg_t)olayout.ol_pgid;
14224 vector<int> osds;
14225 int primary;
14226 o.pg_to_acting_osds(pg, &osds, &primary);
14227 return primary;
14228 });
14229 }
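// Worked example (values assumed): object_size = 4 MiB, stripe_unit = 1 MiB,
// stripe_count = 3 gives stripes_per_object = 4. For blockno = 10:
//   stripeno    = 10 / 3 = 3   (horizontal stripe)
//   stripepos   = 10 % 3 = 1   (object within the stripe)
//   objectsetno = 3 / 4  = 0
//   objectno    = 0 * 3 + 1 = 1
// so block 10 lives in object 1 of the file, and the primary OSD of that
// object's PG is returned.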
14230
14231 /* Return the offset of the block, internal to the object */
14232
14233 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14234 {
14235 std::scoped_lock lock(client_lock);
14236 file_layout_t *layout = &(in->layout);
14237 uint32_t object_size = layout->object_size;
14238 uint32_t su = layout->stripe_unit;
14239 uint64_t stripes_per_object = object_size / su;
14240
14241 return (blockno % stripes_per_object) * su;
14242 }
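// Continuing the example above: with stripes_per_object = 4 and
// stripe_unit = 1 MiB, blockno = 10 maps to (10 % 4) * 1 MiB = 2 MiB into
// its object.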
14243
14244 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14245 const UserPerm& perms)
14246 {
14247 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14248 if (!mref_reader.is_state_satisfied())
14249 return -CEPHFS_ENOTCONN;
14250
14251 vinodeno_t vino = _get_vino(in);
14252
14253 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14254 tout(cct) << "ll_opendir" << std::endl;
14255 tout(cct) << vino.ino.val << std::endl;
14256
14257 std::scoped_lock lock(client_lock);
14258
14259 if (!fuse_default_permissions) {
14260 int r = may_open(in, flags, perms);
14261 if (r < 0)
14262 return r;
14263 }
14264
14265 int r = _opendir(in, dirpp, perms);
14266 tout(cct) << (uintptr_t)*dirpp << std::endl;
14267
14268 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14269 << dendl;
14270 return r;
14271 }
14272
14273 int Client::ll_releasedir(dir_result_t *dirp)
14274 {
14275 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14276 if (!mref_reader.is_state_satisfied())
14277 return -CEPHFS_ENOTCONN;
14278
14279 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14280 tout(cct) << "ll_releasedir" << std::endl;
14281 tout(cct) << (uintptr_t)dirp << std::endl;
14282
14283 std::scoped_lock lock(client_lock);
14284
14285 _closedir(dirp);
14286 return 0;
14287 }
14288
14289 int Client::ll_fsyncdir(dir_result_t *dirp)
14290 {
14291 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14292 if (!mref_reader.is_state_satisfied())
14293 return -CEPHFS_ENOTCONN;
14294
14295 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14296 tout(cct) << "ll_fsyncdir" << std::endl;
14297 tout(cct) << (uintptr_t)dirp << std::endl;
14298
14299 std::scoped_lock lock(client_lock);
14300 return _fsync(dirp->inode.get(), false);
14301 }
14302
14303 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14304 {
14305 ceph_assert(!(flags & O_CREAT));
14306
14307 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14308 if (!mref_reader.is_state_satisfied())
14309 return -CEPHFS_ENOTCONN;
14310
14311 vinodeno_t vino = _get_vino(in);
14312
14313 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14314 tout(cct) << "ll_open" << std::endl;
14315 tout(cct) << vino.ino.val << std::endl;
14316 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14317
14318 std::scoped_lock lock(client_lock);
14319
14320 int r;
14321 if (!fuse_default_permissions) {
14322 r = may_open(in, flags, perms);
14323 if (r < 0)
14324 goto out;
14325 }
14326
14327 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14328
14329 out:
14330 Fh *fhptr = fhp ? *fhp : NULL;
14331 if (fhptr) {
14332 ll_unclosed_fh_set.insert(fhptr);
14333 }
14334 tout(cct) << (uintptr_t)fhptr << std::endl;
14335 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14336 " = " << r << " (" << fhptr << ")" << dendl;
14337 return r;
14338 }
14339
14340 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14341 int flags, InodeRef *in, int caps, Fh **fhp,
14342 const UserPerm& perms)
14343 {
14344 *fhp = NULL;
14345
14346 vinodeno_t vparent = _get_vino(parent);
14347
14348 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14349 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14350 << ", gid " << perms.gid() << dendl;
14351 tout(cct) << "ll_create" << std::endl;
14352 tout(cct) << vparent.ino.val << std::endl;
14353 tout(cct) << name << std::endl;
14354 tout(cct) << mode << std::endl;
14355 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14356
14357 bool created = false;
14358 int r = _lookup(parent, name, caps, in, perms);
14359
14360 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
14361 return -CEPHFS_EEXIST;
14362
14363 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
14364 if (!fuse_default_permissions) {
14365 r = may_create(parent, perms);
14366 if (r < 0)
14367 goto out;
14368 }
14369 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
14370 perms, "");
14371 if (r < 0)
14372 goto out;
14373 }
14374
14375 if (r < 0)
14376 goto out;
14377
14378 ceph_assert(*in);
14379
14380 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14381 if (!created) {
14382 if (!fuse_default_permissions) {
14383 r = may_open(in->get(), flags, perms);
14384 if (r < 0) {
14385 if (*fhp) {
14386 int release_r = _release_fh(*fhp);
14387 ceph_assert(release_r == 0); // during create, no async data ops should have happened
14388 }
14389 goto out;
14390 }
14391 }
14392 if (*fhp == NULL) {
14393 r = _open(in->get(), flags, mode, fhp, perms);
14394 if (r < 0)
14395 goto out;
14396 }
14397 }
14398
14399 out:
14400 if (*fhp) {
14401 ll_unclosed_fh_set.insert(*fhp);
14402 }
14403
14404 ino_t ino = 0;
14405 if (r >= 0) {
14406 Inode *inode = in->get();
14407 if (use_faked_inos())
14408 ino = inode->faked_ino;
14409 else
14410 ino = inode->ino;
14411 }
14412
14413 tout(cct) << (uintptr_t)*fhp << std::endl;
14414 tout(cct) << ino << std::endl;
14415 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
14416 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14417 *fhp << " " << hex << ino << dec << ")" << dendl;
14418
14419 return r;
14420 }
14421
14422 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14423 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14424 const UserPerm& perms)
14425 {
14426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14427 if (!mref_reader.is_state_satisfied())
14428 return -CEPHFS_ENOTCONN;
14429
14430 std::scoped_lock lock(client_lock);
14431 InodeRef in;
14432
14433 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14434 fhp, perms);
14435 if (r >= 0) {
14436 ceph_assert(in);
14437
14438 // passing an Inode in outp requires an additional ref
14439 if (outp) {
14440 _ll_get(in.get());
14441 *outp = in.get();
14442 }
14443 fill_stat(in, attr);
14444 } else {
14445 attr->st_ino = 0;
14446 }
14447
14448 return r;
14449 }
14450
14451 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14452 int oflags, Inode **outp, Fh **fhp,
14453 struct ceph_statx *stx, unsigned want, unsigned lflags,
14454 const UserPerm& perms)
14455 {
14456 unsigned caps = statx_to_mask(lflags, want);
14457 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14458 if (!mref_reader.is_state_satisfied())
14459 return -CEPHFS_ENOTCONN;
14460
14461 std::scoped_lock lock(client_lock);
14462 InodeRef in;
14463
14464 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14465 if (r >= 0) {
14466 ceph_assert(in);
14467
14468 // passing an Inode in outp requires an additional ref
14469 if (outp) {
14470 _ll_get(in.get());
14471 *outp = in.get();
14472 }
14473 fill_statx(in, caps, stx);
14474 } else {
14475 stx->stx_ino = 0;
14476 stx->stx_mask = 0;
14477 }
14478
14479 return r;
14480 }
14481
14482 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14483 {
14484 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14485 if (!mref_reader.is_state_satisfied())
14486 return -CEPHFS_ENOTCONN;
14487
14488 tout(cct) << "ll_lseek" << std::endl;
14489 tout(cct) << offset << std::endl;
14490 tout(cct) << whence << std::endl;
14491
14492 std::scoped_lock lock(client_lock);
14493 return _lseek(fh, offset, whence);
14494 }
14495
14496 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14497 {
14498 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14499 if (!mref_reader.is_state_satisfied())
14500 return -CEPHFS_ENOTCONN;
14501
14502 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
14503 tout(cct) << "ll_read" << std::endl;
14504 tout(cct) << (uintptr_t)fh << std::endl;
14505 tout(cct) << off << std::endl;
14506 tout(cct) << len << std::endl;
14507
14508 /* We can't return a read count larger than INT_MAX, so clamp len to that */
14509 len = std::min(len, (loff_t)INT_MAX);
14510 std::scoped_lock lock(client_lock);
14511
14512 int r = _read(fh, off, len, bl);
14513 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14514 << dendl;
14515 return r;
14516 }
14517
14518 int Client::ll_read_block(Inode *in, uint64_t blockid,
14519 char *buf,
14520 uint64_t offset,
14521 uint64_t length,
14522 file_layout_t* layout)
14523 {
14524 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14525 if (!mref_reader.is_state_satisfied())
14526 return -CEPHFS_ENOTCONN;
14527
14528 vinodeno_t vino = _get_vino(in);
14529 object_t oid = file_object_t(vino.ino, blockid);
14530 C_SaferCond onfinish;
14531 bufferlist bl;
14532
14533 objecter->read(oid,
14534 object_locator_t(layout->pool_id),
14535 offset,
14536 length,
14537 vino.snapid,
14538 &bl,
14539 CEPH_OSD_FLAG_READ,
14540 &onfinish);
14541
14542 int r = onfinish.wait();
14543 if (r >= 0) {
14544 bl.begin().copy(bl.length(), buf);
14545 r = bl.length();
14546 }
14547
14548 return r;
14549 }
14550
14551 /* The OSD doesn't appear to return success unless the entire
14552 buffer was written; return the write length on success. */
14553
14554 int Client::ll_write_block(Inode *in, uint64_t blockid,
14555 char* buf, uint64_t offset,
14556 uint64_t length, file_layout_t* layout,
14557 uint64_t snapseq, uint32_t sync)
14558 {
14559 vinodeno_t vino = ll_get_vino(in);
14560 int r = 0;
14561 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14562
14563 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14564 if (!mref_reader.is_state_satisfied())
14565 return -CEPHFS_ENOTCONN;
14566
14567 if (length == 0) {
14568 return -CEPHFS_EINVAL;
14569 }
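  /* Note: because of the "true ||" below, the sync flag is effectively
   * ignored and every write waits for the OSD commit. */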
14570 if (true || sync) {
14571 /* if write is stable, the epilogue is waiting on
14572 * flock */
14573 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
14574 }
14575 object_t oid = file_object_t(vino.ino, blockid);
14576 SnapContext fakesnap;
14577 ceph::bufferlist bl;
14578 if (length > 0) {
14579 bl.push_back(buffer::copy(buf, length));
14580 }
14581
14582 ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
14583 << dendl;
14584
14585 fakesnap.seq = snapseq;
14586
14587 /* lock just in time */
14588 objecter->write(oid,
14589 object_locator_t(layout->pool_id),
14590 offset,
14591 length,
14592 fakesnap,
14593 bl,
14594 ceph::real_clock::now(),
14595 0,
14596 onsafe.get());
14597
14598 if (nullptr != onsafe) {
14599 r = onsafe->wait();
14600 }
14601
14602 if (r < 0) {
14603 return r;
14604 } else {
14605 return length;
14606 }
14607 }
14608
14609 int Client::ll_commit_blocks(Inode *in,
14610 uint64_t offset,
14611 uint64_t length)
14612 {
14613 /*
14614 BarrierContext *bctx;
14615 vinodeno_t vino = _get_vino(in);
14616 uint64_t ino = vino.ino;
14617
14618 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14619 << offset << " to " << length << dendl;
14620
14621 if (length == 0) {
14622 return -CEPHFS_EINVAL;
14623 }
14624
14625 std::scoped_lock lock(client_lock);
14626 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14627 if (p != barriers.end()) {
14628 barrier_interval civ(offset, offset + length);
14629 p->second->commit_barrier(civ);
14630 }
14631 */
14632 return 0;
14633 }
14634
14635 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14636 {
14637 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14638 "~" << len << dendl;
14639 tout(cct) << "ll_write" << std::endl;
14640 tout(cct) << (uintptr_t)fh << std::endl;
14641 tout(cct) << off << std::endl;
14642 tout(cct) << len << std::endl;
14643
14644 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14645 if (!mref_reader.is_state_satisfied())
14646 return -CEPHFS_ENOTCONN;
14647
14648 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14649 len = std::min(len, (loff_t)INT_MAX);
14650 std::scoped_lock lock(client_lock);
14651
14652 int r = _write(fh, off, len, data, NULL, 0);
14653 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14654 << dendl;
14655 return r;
14656 }
14657
14658 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14659 {
14660 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14661 if (!mref_reader.is_state_satisfied())
14662 return -CEPHFS_ENOTCONN;
14663
14664 std::scoped_lock cl(client_lock);
14665 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
14666 }
14667
14668 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14669 {
14670 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14671 if (!mref_reader.is_state_satisfied())
14672 return -CEPHFS_ENOTCONN;
14673
14674 std::scoped_lock cl(client_lock);
14675 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
14676 }
14677
14678 int Client::ll_flush(Fh *fh)
14679 {
14680 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14681 if (!mref_reader.is_state_satisfied())
14682 return -CEPHFS_ENOTCONN;
14683
14684 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14685 tout(cct) << "ll_flush" << std::endl;
14686 tout(cct) << (uintptr_t)fh << std::endl;
14687
14688 std::scoped_lock lock(client_lock);
14689 return _flush(fh);
14690 }
14691
14692 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14693 {
14694 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14695 if (!mref_reader.is_state_satisfied())
14696 return -CEPHFS_ENOTCONN;
14697
14698 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14699 tout(cct) << "ll_fsync" << std::endl;
14700 tout(cct) << (uintptr_t)fh << std::endl;
14701
14702 std::scoped_lock lock(client_lock);
14703 int r = _fsync(fh, syncdataonly);
14704 if (r) {
14705 // If we're returning an error, clear it from the FH
14706 fh->take_async_err();
14707 }
14708 return r;
14709 }
14710
14711 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14712 {
14713 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14714 if (!mref_reader.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN;
14716
14717 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14718 tout(cct) << "ll_sync_inode" << std::endl;
14719 tout(cct) << (uintptr_t)in << std::endl;
14720
14721 std::scoped_lock lock(client_lock);
14722 return _fsync(in, syncdataonly);
14723 }
14724
14725 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14726 {
14727 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14728
14729 if (offset < 0 || length <= 0)
14730 return -CEPHFS_EINVAL;
14731
14732 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14733 return -CEPHFS_EOPNOTSUPP;
14734
14735 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14736 return -CEPHFS_EOPNOTSUPP;
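  // Net effect of the two checks above: the only modes accepted here are 0,
  // FALLOC_FL_KEEP_SIZE, and FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE.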
14737
14738 Inode *in = fh->inode.get();
14739
14740 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14741 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14742 return -CEPHFS_ENOSPC;
14743 }
14744
14745 if (in->snapid != CEPH_NOSNAP)
14746 return -CEPHFS_EROFS;
14747
14748 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14749 return -CEPHFS_EBADF;
14750
14751 uint64_t size = offset + length;
14752 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14753 size > in->size &&
14754 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14755 return -CEPHFS_EDQUOT;
14756 }
14757
14758 int have;
14759 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14760 if (r < 0)
14761 return r;
14762
14763 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14764 if (mode & FALLOC_FL_PUNCH_HOLE) {
14765 if (in->inline_version < CEPH_INLINE_NONE &&
14766 (have & CEPH_CAP_FILE_BUFFER)) {
14767 bufferlist bl;
14768 auto inline_iter = in->inline_data.cbegin();
14769 int len = in->inline_data.length();
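// Splice a zeroed hole into the inline blob: e.g. with len = 10,
// offset = 2 and length = 4, bl becomes bytes [0,2) of the old data,
// then 4 zero bytes, then bytes [6,10) of the old data.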
14770 if (offset < len) {
14771 if (offset > 0)
14772 inline_iter.copy(offset, bl);
14773 int size = length;
14774 if (offset + size > len)
14775 size = len - offset;
14776 if (size > 0)
14777 bl.append_zero(size);
14778 if (offset + size < len) {
14779 inline_iter += size;
14780 inline_iter.copy(len - offset - size, bl);
14781 }
14782 in->inline_data = bl;
14783 in->inline_version++;
14784 }
14785 in->mtime = in->ctime = ceph_clock_now();
14786 in->change_attr++;
14787 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14788 } else {
14789 if (in->inline_version < CEPH_INLINE_NONE) {
14790 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14791 uninline_data(in, onuninline.get());
14792 }
14793
14794 C_SaferCond onfinish("Client::_punch_hole flock");
14795
14796 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14797
14798 _invalidate_inode_cache(in, offset, length);
14799 filer->zero(in->ino, &in->layout,
14800 in->snaprealm->get_snap_context(),
14801 offset, length,
14802 ceph::real_clock::now(),
14803 0, true, &onfinish);
14804 in->mtime = in->ctime = ceph_clock_now();
14805 in->change_attr++;
14806 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14807
14808 client_lock.unlock();
14809 onfinish.wait();
14810 client_lock.lock();
14811 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14812 }
14813 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14814 uint64_t size = offset + length;
14815 if (size > in->size) {
14816 in->size = size;
14817 in->mtime = in->ctime = ceph_clock_now();
14818 in->change_attr++;
14819 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14820
14821 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14822 check_caps(in, CHECK_CAPS_NODELAY);
14823 } else if (is_max_size_approaching(in)) {
14824 check_caps(in, 0);
14825 }
14826 }
14827 }
14828
14829 if (nullptr != onuninline) {
14830 client_lock.unlock();
14831 int ret = onuninline->wait();
14832 client_lock.lock();
14833
14834 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14835 in->inline_data.clear();
14836 in->inline_version = CEPH_INLINE_NONE;
14837 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14838 check_caps(in, 0);
14839 } else
14840 r = ret;
14841 }
14842
14843 put_cap_ref(in, CEPH_CAP_FILE_WR);
14844 return r;
14845 }
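
// The accepted fallocate(2) mode combinations, illustrated (flag names as in
// <linux/falloc.h>):
//
//   _fallocate(fh, 0, off, len);                       // may grow the file size
//   _fallocate(fh, FALLOC_FL_KEEP_SIZE, off, len);     // size unchanged
//   _fallocate(fh, FALLOC_FL_PUNCH_HOLE |
//                  FALLOC_FL_KEEP_SIZE, off, len);     // punch a hole
//   _fallocate(fh, FALLOC_FL_PUNCH_HOLE, off, len);    // -CEPHFS_EOPNOTSUPP
//   _fallocate(fh, FALLOC_FL_ZERO_RANGE, off, len);    // -CEPHFS_EOPNOTSUPP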
14846
14847 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14848 {
14849 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14850 if (!mref_reader.is_state_satisfied())
14851 return -CEPHFS_ENOTCONN;
14852
14853 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14854 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14855 tout(cct) << (uintptr_t)fh << std::endl;
14856
14857 std::scoped_lock lock(client_lock);
14858 return _fallocate(fh, mode, offset, length);
14859 }
14860
14861 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14862 {
14863 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14864 if (!mref_reader.is_state_satisfied())
14865 return -CEPHFS_ENOTCONN;
14866
14867 tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14868
14869 std::scoped_lock lock(client_lock);
14870 Fh *fh = get_filehandle(fd);
14871 if (!fh)
14872 return -CEPHFS_EBADF;
14873 #if defined(__linux__) && defined(O_PATH)
14874 if (fh->flags & O_PATH)
14875 return -CEPHFS_EBADF;
14876 #endif
14877 return _fallocate(fh, mode, offset, length);
14878 }
14879
14880 int Client::ll_release(Fh *fh)
14881 {
14882 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14883 if (!mref_reader.is_state_satisfied())
14884 return -CEPHFS_ENOTCONN;
14885
14886 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << dendl;
14888 tout(cct) << __func__ << " (fh)" << std::endl;
14889 tout(cct) << (uintptr_t)fh << std::endl;
14890
14891 std::scoped_lock lock(client_lock);
14892
14893 if (ll_unclosed_fh_set.count(fh))
14894 ll_unclosed_fh_set.erase(fh);
14895 return _release_fh(fh);
14896 }
14897
14898 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14899 {
14900 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14901 if (!mref_reader.is_state_satisfied())
14902 return -CEPHFS_ENOTCONN;
14903
14904 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
14905 tout(cct) << "ll_getlk (fh)" << (uintptr_t)fh << std::endl;
14906
14907 std::scoped_lock lock(client_lock);
14908 return _getlk(fh, fl, owner);
14909 }
14910
14911 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14912 {
14913 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14914 if (!mref_reader.is_state_satisfied())
14915 return -CEPHFS_ENOTCONN;
14916
14917 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14918 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14919
14920 std::scoped_lock lock(client_lock);
14921 return _setlk(fh, fl, owner, sleep);
14922 }
14923
14924 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14925 {
14926 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14927 if (!mref_reader.is_state_satisfied())
14928 return -CEPHFS_ENOTCONN;
14929
14930 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14931 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14932
14933 std::scoped_lock lock(client_lock);
14934 return _flock(fh, cmd, owner);
14935 }
14936
14937 int Client::set_deleg_timeout(uint32_t timeout)
14938 {
14939 std::scoped_lock lock(client_lock);
14940
14941 /*
14942 * The whole point is to prevent blocklisting so we must time out the
14943 * delegation before the session autoclose timeout kicks in.
14944 */
14945 if (timeout >= mdsmap->get_session_autoclose())
14946 return -CEPHFS_EINVAL;
14947
14948 deleg_timeout = timeout;
14949 return 0;
14950 }
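
// Illustrative example: with the default session_autoclose of 300 seconds,
// set_deleg_timeout(300) is rejected with -CEPHFS_EINVAL, while
// set_deleg_timeout(60) succeeds and leaves ample headroom before the MDS
// would evict (and blocklist) the client.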
14951
14952 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14953 {
14954 int ret = -CEPHFS_EINVAL;
14955
14956 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14957 if (!mref_reader.is_state_satisfied())
14958 return -CEPHFS_ENOTCONN;
14959
14960 std::scoped_lock lock(client_lock);
14961
14962 Inode *inode = fh->inode.get();
14963
14964 switch(cmd) {
14965 case CEPH_DELEGATION_NONE:
14966 inode->unset_deleg(fh);
14967 ret = 0;
14968 break;
14969 default:
14970 try {
14971 ret = inode->set_deleg(fh, cmd, cb, priv);
14972 } catch (std::bad_alloc&) {
14973 ret = -CEPHFS_ENOMEM;
14974 }
14975 break;
14976 }
14977 return ret;
14978 }
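
// A minimal sketch of taking a read delegation via this entry point; the
// recall callback shape follows ceph_deleg_cb_t from
// include/cephfs/ceph_ll_client.h, and `my_priv` is a hypothetical cookie:
//
//   static void my_recall(Fh *fh, void *priv) {
//     // The MDS wants the delegation back: finish local work quickly, then
//     // drop it (before deleg_timeout expires) with
//     // ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr).
//   }
//
//   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, my_recall, my_priv);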
14979
14980 class C_Client_RequestInterrupt : public Context {
14981 private:
14982 Client *client;
14983 MetaRequest *req;
14984 public:
14985 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14986 req->get();
14987 }
14988 void finish(int r) override {
14989 std::scoped_lock l(client->client_lock);
14990 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14991 client->_interrupt_filelock(req);
14992 client->put_request(req);
14993 }
14994 };
14995
14996 void Client::ll_interrupt(void *d)
14997 {
14998 MetaRequest *req = static_cast<MetaRequest*>(d);
14999 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15000 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
15001 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15002 }
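
// The `d` cookie is the MetaRequest itself and must be handed back verbatim.
// A hedged sketch of the FUSE-side wiring (libfuse lowlevel API; the actual
// registration lives in the fuse client glue):
//
//   static void interrupt_cb(fuse_req_t req, void *data) {
//     client->ll_interrupt(data);   // data == MetaRequest*
//   }
//   fuse_req_interrupt_func(req, interrupt_cb, mreq);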
15003
15004 // =========================================
15005 // layout
15006
15007 // expose file layouts
15008
15009 int Client::describe_layout(const char *relpath, file_layout_t *lp,
15010 const UserPerm& perms)
15011 {
15012 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15013 if (!mref_reader.is_state_satisfied())
15014 return -CEPHFS_ENOTCONN;
15015
15016 std::scoped_lock lock(client_lock);
15017
15018 filepath path(relpath);
15019 InodeRef in;
15020 int r = path_walk(path, &in, perms);
15021 if (r < 0)
15022 return r;
15023
15024 *lp = in->layout;
15025
15026 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
15027 return 0;
15028 }
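
// Illustrative usage (`perms` is assumed to be a previously constructed
// UserPerm; file_layout_t is defined in include/fs_types.h):
//
//   file_layout_t layout;
//   int r = client->describe_layout("/dir/file", &layout, perms);
//   if (r == 0)
//     printf("su=%u sc=%u os=%u pool=%lld\n", layout.stripe_unit,
//            layout.stripe_count, layout.object_size,
//            (long long)layout.pool_id);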
15029
15030 int Client::fdescribe_layout(int fd, file_layout_t *lp)
15031 {
15032 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15033 if (!mref_reader.is_state_satisfied())
15034 return -CEPHFS_ENOTCONN;
15035
15036 std::scoped_lock lock(client_lock);
15037
15038 Fh *f = get_filehandle(fd);
15039 if (!f)
15040 return -CEPHFS_EBADF;
15041 Inode *in = f->inode.get();
15042
15043 *lp = in->layout;
15044
15045 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
15046 return 0;
15047 }
15048
15049 int64_t Client::get_default_pool_id()
15050 {
15051 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15052 if (!mref_reader.is_state_satisfied())
15053 return -CEPHFS_ENOTCONN;
15054
15055 std::scoped_lock lock(client_lock);
15056
15057 /* first data pool is the default */
15058 return mdsmap->get_first_data_pool();
15059 }
15060
15061 // expose osdmap
15062
15063 int64_t Client::get_pool_id(const char *pool_name)
15064 {
15065 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15066 if (!mref_reader.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN;
15068
15069 std::scoped_lock lock(client_lock);
15070
15071 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15072 pool_name);
15073 }
15074
15075 string Client::get_pool_name(int64_t pool)
15076 {
15077 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15078 if (!mref_reader.is_state_satisfied())
15079 return string();
15080
15081 std::scoped_lock lock(client_lock);
15082
15083 return objecter->with_osdmap([pool](const OSDMap& o) {
15084 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15085 });
15086 }
15087
15088 int Client::get_pool_replication(int64_t pool)
15089 {
15090 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15091 if (!mref_reader.is_state_satisfied())
15092 return -CEPHFS_ENOTCONN;
15093
15094 std::scoped_lock lock(client_lock);
15095
15096 return objecter->with_osdmap([pool](const OSDMap& o) {
15097 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
15098 });
15099 }
15100
15101 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
15102 {
15103 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15104 if (!mref_reader.is_state_satisfied())
15105 return -CEPHFS_ENOTCONN;
15106
15107 std::scoped_lock lock(client_lock);
15108
15109 Fh *f = get_filehandle(fd);
15110 if (!f)
15111 return -CEPHFS_EBADF;
15112 Inode *in = f->inode.get();
15113
15114 vector<ObjectExtent> extents;
15115 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
15116 ceph_assert(extents.size() == 1);
15117
15118 objecter->with_osdmap([&](const OSDMap& o) {
15119 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15120 o.pg_to_acting_osds(pg, osds);
15121 });
15122
15123 if (osds.empty())
15124 return -CEPHFS_EINVAL;
15125
15126 /*
15127 * Return the remainder of the extent (stripe unit)
15128 *
15129 * If length = 1 is passed to Striper::file_to_extents we get a single
15130 * extent back, but its length is one so we still need to compute the length
15131 * to the end of the stripe unit.
15132 *
15133 * If length = su then we may get 1 or 2 objects back in the extents vector
15134 * which would have to be examined. Even then, the offsets are local to the
15135 * object, so matching up to the file offset is extra work.
15136 *
15137 * It seems simpler to stick with length = 1 and manually compute the
15138 * remainder.
15139 */
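/*
 * Worked example: with stripe_unit = 4 MiB and off = 5 MiB,
 * off % su = 1 MiB, so *len = 3 MiB -- the distance from off to the end
 * of its stripe unit.
 */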
15140 if (len) {
15141 uint64_t su = in->layout.stripe_unit;
15142 *len = su - (off % su);
15143 }
15144
15145 return 0;
15146 }
15147
15148 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15149 {
15150 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15151 if (!mref_reader.is_state_satisfied())
15152 return -CEPHFS_ENOTCONN;
15153
15154 std::scoped_lock lock(client_lock);
15155
15156 if (id < 0)
15157 return -CEPHFS_EINVAL;
15158 return objecter->with_osdmap([&](const OSDMap& o) {
15159 return o.crush->get_full_location_ordered(id, path);
15160 });
15161 }
15162
15163 int Client::get_file_stripe_address(int fd, loff_t offset,
15164 vector<entity_addr_t>& address)
15165 {
15166 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15167 if (!mref_reader.is_state_satisfied())
15168 return -CEPHFS_ENOTCONN;
15169
15170 std::scoped_lock lock(client_lock);
15171
15172 Fh *f = get_filehandle(fd);
15173 if (!f)
15174 return -CEPHFS_EBADF;
15175 Inode *in = f->inode.get();
15176
15177 // which object?
15178 vector<ObjectExtent> extents;
15179 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15180 in->truncate_size, extents);
15181 ceph_assert(extents.size() == 1);
15182
15183 // now we have the object and its 'layout'
15184 return objecter->with_osdmap([&](const OSDMap& o) {
15185 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15186 vector<int> osds;
15187 o.pg_to_acting_osds(pg, osds);
15188 if (osds.empty())
15189 return -CEPHFS_EINVAL;
15190 for (unsigned i = 0; i < osds.size(); i++) {
15191 entity_addr_t addr = o.get_addrs(osds[i]).front();
15192 address.push_back(addr);
15193 }
15194 return 0;
15195 });
15196 }
15197
15198 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15199 {
15200 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15201 if (!mref_reader.is_state_satisfied())
15202 return -CEPHFS_ENOTCONN;
15203
15204 std::scoped_lock lock(client_lock);
15205
15206 return objecter->with_osdmap([&](const OSDMap& o) {
15207 if (!o.exists(osd))
15208 return -CEPHFS_ENOENT;
15209
15210 addr = o.get_addrs(osd).front();
15211 return 0;
15212 });
15213 }
15214
15215 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15216 loff_t length, loff_t offset)
15217 {
15218 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15219 if (!mref_reader.is_state_satisfied())
15220 return -CEPHFS_ENOTCONN;
15221
15222 std::scoped_lock lock(client_lock);
15223
15224 Fh *f = get_filehandle(fd);
15225 if (!f)
15226 return -CEPHFS_EBADF;
15227 Inode *in = f->inode.get();
15228
15229 // map to a list of extents
15230 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15231
15232 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15233 return 0;
15234 }
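
// Illustrative example: with the default layout (stripe_unit = object_size =
// 4 MiB, stripe_count = 1), calling enumerate_layout() with length = 6 MiB
// and offset = 2 MiB covers file bytes [2 MiB, 8 MiB) and yields two extents:
// the last 2 MiB of object 0 and all 4 MiB of object 1.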
15235
15236
15237 /* Find an OSD with the same IP address; returns -CEPHFS_ENXIO if none. */
15238 int Client::get_local_osd()
15239 {
15240 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15241 if (!mref_reader.is_state_satisfied())
15242 return -CEPHFS_ENOTCONN;
15243
15244 std::scoped_lock lock(client_lock);
15245
15246 objecter->with_osdmap([this](const OSDMap& o) {
15247 if (o.get_epoch() != local_osd_epoch) {
15248 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
15249 local_osd_epoch = o.get_epoch();
15250 }
15251 });
15252 return local_osd;
15253 }
15254
15255
15256
15257
15258
15259
15260 // ===============================
15261
15262 void Client::ms_handle_connect(Connection *con)
15263 {
15264 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
15265 }
15266
15267 bool Client::ms_handle_reset(Connection *con)
15268 {
15269 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15270 return false;
15271 }
15272
15273 void Client::ms_handle_remote_reset(Connection *con)
15274 {
15275 std::scoped_lock lock(client_lock);
15276 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
15277 switch (con->get_peer_type()) {
15278 case CEPH_ENTITY_TYPE_MDS:
15279 {
15280 // kludge to figure out which mds this is; fixme with a Connection* state
15281 mds_rank_t mds = MDS_RANK_NONE;
15282 MetaSessionRef s = NULL;
15283 for (auto &p : mds_sessions) {
15284 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
15285 mds = p.first;
15286 s = p.second;
15287 }
15288 }
15289 if (mds >= 0) {
15290 ceph_assert(s != NULL);
15291 switch (s->state) {
15292 case MetaSession::STATE_CLOSING:
15293 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
15294 _closed_mds_session(s.get());
15295 break;
15296
15297 case MetaSession::STATE_OPENING:
15298 {
15299 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15300 list<Context*> waiters;
15301 waiters.swap(s->waiting_for_open);
15302 _closed_mds_session(s.get());
15303 auto news = _get_or_open_mds_session(mds);
15304 news->waiting_for_open.swap(waiters);
15305 }
15306 break;
15307
15308 case MetaSession::STATE_OPEN:
15309 {
15310 objecter->maybe_request_map(); /* to check if we are blocklisted */
15311 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
15312 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
15313 _closed_mds_session(s.get());
15314 } else {
15315 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15316 s->state = MetaSession::STATE_STALE;
15317 }
15318 }
15319 break;
15320
15321 case MetaSession::STATE_NEW:
15322 case MetaSession::STATE_CLOSED:
15323 default:
15324 break;
15325 }
15326 }
15327 }
15328 break;
15329 }
15330 }
15331
15332 bool Client::ms_handle_refused(Connection *con)
15333 {
15334 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
15335 return false;
15336 }
15337
15338 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15339 {
15340 Inode *quota_in = root_ancestor;
15341 SnapRealm *realm = in->snaprealm;
15342
15343 if (!cct->_conf.get_val<bool>("client_quota"))
15344 return NULL;
15345
15346 while (realm) {
15347 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15348 if (realm->ino != in->ino) {
15349 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15350 if (p == inode_map.end())
15351 break;
15352
15353 if (p->second->quota.is_enable()) {
15354 quota_in = p->second;
15355 break;
15356 }
15357 }
15358 realm = realm->pparent;
15359 }
15360 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15361 return quota_in;
15362 }
15363
15364 /**
15365 * Traverse the quota ancestors of the Inode; return true
15366 * if any of them satisfies the passed predicate
15367 */
15368 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15369 std::function<bool (const Inode &in)> test)
15370 {
15371 if (!cct->_conf.get_val<bool>("client_quota"))
15372 return false;
15373
15374 while (true) {
15375 ceph_assert(in != NULL);
15376 if (test(*in)) {
15377 return true;
15378 }
15379
15380 if (in == root_ancestor) {
15381 // We're done traversing, drop out
15382 return false;
15383 } else {
15384 // Continue up the tree
15385 in = get_quota_root(in, perms);
15386 }
15387 }
15388
15389 return false;
15390 }
15391
15392 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15393 {
15394 return check_quota_condition(in, perms,
15395 [](const Inode &in) {
15396 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15397 });
15398 }
15399
15400 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15401 const UserPerm& perms)
15402 {
15403 return check_quota_condition(in, perms,
15404 [&new_bytes](const Inode &in) {
15405 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15406 > in.quota.max_bytes;
15407 });
15408 }
15409
15410 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15411 {
15412 ceph_assert(in->size >= in->reported_size);
15413 const uint64_t size = in->size - in->reported_size;
15414 return check_quota_condition(in, perms,
15415 [&size](const Inode &in) {
15416 if (in.quota.max_bytes) {
15417 if (in.rstat.rbytes >= in.quota.max_bytes) {
15418 return true;
15419 }
15420
15421 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15422 return (space >> 4) < size;
15423 } else {
15424 return false;
15425 }
15426 });
15427 }
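
// Worked example of the (space >> 4) heuristic above: with quota.max_bytes =
// 16 GiB and rstat.rbytes = 15 GiB, space = 1 GiB and space >> 4 = 64 MiB, so
// the quota counts as "approaching" once more than 64 MiB of local writes
// have not yet been reported to the MDS (in->size - in->reported_size).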
15428
15429 enum {
15430 POOL_CHECKED = 1,
15431 POOL_CHECKING = 2,
15432 POOL_READ = 4,
15433 POOL_WRITE = 8,
15434 };
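
// The pool_perms map below acts as a small per-(pool, namespace) state
// machine: the first caller stores POOL_CHECKING and probes the pool, later
// callers block on waiting_for_pool_perm, and the final cached value is
// POOL_CHECKED | (POOL_READ and/or POOL_WRITE). An indeterminate probe
// erases the entry so the next caller retries.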
15435
15436 int Client::check_pool_perm(Inode *in, int need)
15437 {
15438 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15439
15440 if (!cct->_conf->client_check_pool_perm)
15441 return 0;
15442
15443 /* Only need to do this for regular files */
15444 if (!in->is_file())
15445 return 0;
15446
15447 int64_t pool_id = in->layout.pool_id;
15448 std::string pool_ns = in->layout.pool_ns;
15449 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
15450 int have = 0;
15451 while (true) {
15452 auto it = pool_perms.find(perm_key);
15453 if (it == pool_perms.end())
15454 break;
15455 if (it->second == POOL_CHECKING) {
15456 // avoid concurrent checks
15457 wait_on_list(waiting_for_pool_perm);
15458 } else {
15459 have = it->second;
15460 ceph_assert(have & POOL_CHECKED);
15461 break;
15462 }
15463 }
15464
15465 if (!have) {
15466 if (in->snapid != CEPH_NOSNAP) {
15467 // The pool permission check needs to write to the first object. But for a
15468 // snapshot, the head of the first object may already have been deleted. To
15469 // avoid creating an orphan object, skip the check for now.
15470 return 0;
15471 }
15472
15473 pool_perms[perm_key] = POOL_CHECKING;
15474
15475 char oid_buf[32];
15476 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
15477 object_t oid = oid_buf;
15478
15479 SnapContext nullsnapc;
15480
15481 C_SaferCond rd_cond;
15482 ObjectOperation rd_op;
15483 rd_op.stat(nullptr, nullptr, nullptr);
15484
15485 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
15486 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
15487
15488 C_SaferCond wr_cond;
15489 ObjectOperation wr_op;
15490 wr_op.create(true);
15491
15492 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
15493 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
15494
15495 client_lock.unlock();
15496 int rd_ret = rd_cond.wait();
15497 int wr_ret = wr_cond.wait();
15498 client_lock.lock();
15499
15500 bool errored = false;
15501
15502 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
15503 have |= POOL_READ;
15504 else if (rd_ret != -CEPHFS_EPERM) {
15505 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15506 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15507 errored = true;
15508 }
15509
15510 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
15511 have |= POOL_WRITE;
15512 else if (wr_ret != -CEPHFS_EPERM) {
15513 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15514 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15515 errored = true;
15516 }
15517
15518 if (errored) {
15519 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15520 // Raise EIO because actual error code might be misleading for
15521 // userspace filesystem user.
15522 pool_perms.erase(perm_key);
15523 signal_cond_list(waiting_for_pool_perm);
15524 return -CEPHFS_EIO;
15525 }
15526
15527 pool_perms[perm_key] = have | POOL_CHECKED;
15528 signal_cond_list(waiting_for_pool_perm);
15529 }
15530
15531 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15532 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15533 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15534 return -CEPHFS_EPERM;
15535 }
15536 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15537 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15538 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15539 return -CEPHFS_EPERM;
15540 }
15541
15542 return 0;
15543 }
15544
15545 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15546 {
15547 if (acl_type == POSIX_ACL) {
15548 if (in->xattrs.count(ACL_EA_ACCESS)) {
15549 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15550
15551 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15552 }
15553 }
15554 return -CEPHFS_EAGAIN;
15555 }
15556
15557 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15558 {
15559 if (acl_type == NO_ACL)
15560 return 0;
15561
15562 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15563 if (r < 0)
15564 goto out;
15565
15566 if (acl_type == POSIX_ACL) {
15567 if (in->xattrs.count(ACL_EA_ACCESS)) {
15568 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15569 bufferptr acl(access_acl.c_str(), access_acl.length());
15570 r = posix_acl_access_chmod(acl, mode);
15571 if (r < 0)
15572 goto out;
15573 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15574 } else {
15575 r = 0;
15576 }
15577 }
15578 out:
15579 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15580 return r;
15581 }
15582
15583 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15584 const UserPerm& perms)
15585 {
15586 if (acl_type == NO_ACL)
15587 return 0;
15588
15589 if (S_ISLNK(*mode))
15590 return 0;
15591
15592 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15593 if (r < 0)
15594 goto out;
15595
15596 if (acl_type == POSIX_ACL) {
15597 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15598 map<string, bufferptr> xattrs;
15599
15600 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15601 bufferptr acl(default_acl.c_str(), default_acl.length());
15602 r = posix_acl_inherit_mode(acl, mode);
15603 if (r < 0)
15604 goto out;
15605
15606 if (r > 0) {
15607 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15608 if (r < 0)
15609 goto out;
15610 if (r > 0)
15611 xattrs[ACL_EA_ACCESS] = acl;
15612 }
15613
15614 if (S_ISDIR(*mode))
15615 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15616
15617 r = xattrs.size();
15618 if (r > 0)
15619 encode(xattrs, xattrs_bl);
15620 } else {
15621 if (umask_cb)
15622 *mode &= ~umask_cb(callback_handle);
15623 r = 0;
15624 }
15625 }
15626 out:
15627 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15628 return r;
15629 }
15630
15631 void Client::set_filer_flags(int flags)
15632 {
15633 std::scoped_lock l(client_lock);
15634 ceph_assert(flags == 0 ||
15635 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15636 objecter->add_global_op_flags(flags);
15637 }
15638
15639 void Client::clear_filer_flags(int flags)
15640 {
15641 std::scoped_lock l(client_lock);
15642 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15643 objecter->clear_global_op_flag(flags);
15644 }
15645
15646 // called before mount
15647 void Client::set_uuid(const std::string& uuid)
15648 {
15649 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15650 ceph_assert(iref_reader.is_state_satisfied());
15651
15652 std::scoped_lock l(client_lock);
15653 ceph_assert(!uuid.empty());
15654
15655 metadata["uuid"] = uuid;
15656 _close_sessions();
15657 }
15658
15659 // called before mount. 0 means infinite
15660 void Client::set_session_timeout(unsigned timeout)
15661 {
15662 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15663 ceph_assert(iref_reader.is_state_satisfied());
15664
15665 std::scoped_lock l(client_lock);
15666
15667 metadata["timeout"] = stringify(timeout);
15668 }
15669
15670 // called before mount
15671 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15672 const std::string& fs_name)
15673 {
15674 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15675 if (!iref_reader.is_state_satisfied())
15676 return -CEPHFS_ENOTCONN;
15677
15678 if (uuid.empty())
15679 return -CEPHFS_EINVAL;
15680
15681 std::unique_lock l(client_lock);
15682 {
15683 auto it = metadata.find("uuid");
15684 if (it != metadata.end() && it->second == uuid)
15685 return -CEPHFS_EINVAL;
15686 }
15687
15688 int r = subscribe_mdsmap(fs_name);
15689 if (r < 0) {
15690 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15691 return r;
15692 }
15693
15694 if (metadata.empty())
15695 populate_metadata("");
15696
15697 while (mdsmap->get_epoch() == 0)
15698 wait_on_list(waiting_for_mdsmap);
15699
15700 reclaim_errno = 0;
15701 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15702 if (!mdsmap->is_up(mds)) {
15703 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15704 wait_on_list(waiting_for_mdsmap);
15705 continue;
15706 }
15707
15708 MetaSessionRef session;
15709 if (!have_open_session(mds)) {
15710 session = _get_or_open_mds_session(mds);
15711 if (session->state == MetaSession::STATE_REJECTED)
15712 return -CEPHFS_EPERM;
15713 if (session->state != MetaSession::STATE_OPENING) {
15714 // umounting?
15715 return -CEPHFS_EINVAL;
15716 }
15717 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15718 wait_on_context_list(session->waiting_for_open);
15719 continue;
15720 }
15721
15722 session = mds_sessions.at(mds);
15723 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15724 return -CEPHFS_EOPNOTSUPP;
15725
15726 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15727 session->reclaim_state == MetaSession::RECLAIMING) {
15728 session->reclaim_state = MetaSession::RECLAIMING;
15729 auto m = make_message<MClientReclaim>(uuid, flags);
15730 session->con->send_message2(std::move(m));
15731 wait_on_list(waiting_for_reclaim);
15732 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15733 return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
15734 } else {
15735 mds++;
15736 }
15737 }
15738
15739 // didn't find target session in any mds
15740 if (reclaim_target_addrs.empty()) {
15741 if (flags & CEPH_RECLAIM_RESET)
15742 return -CEPHFS_ENOENT;
15743 return -CEPHFS_ENOTRECOVERABLE;
15744 }
15745
15746 if (flags & CEPH_RECLAIM_RESET)
15747 return 0;
15748
15749 // use blocklist to check if target session was killed
15750 // (config option mds_session_blocklist_on_evict needs to be true)
15751 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15752 bs::error_code ec;
15753 l.unlock();
15754 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15755 l.lock();
15756
15757 if (ec)
15758 return ceph::from_error_code(ec);
15759
15760 bool blocklisted = objecter->with_osdmap(
15761 [this](const OSDMap &osd_map) -> bool {
15762 return osd_map.is_blocklisted(reclaim_target_addrs);
15763 });
15764 if (blocklisted)
15765 return -CEPHFS_ENOTRECOVERABLE;
15766
15767 metadata["reclaiming_uuid"] = uuid;
15768 return 0;
15769 }
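
// A hedged sketch of the intended sequence for a restarted client instance
// (assuming the application, e.g. an NFS gateway, remembers its previous
// uuid; `old_uuid`, `new_uuid` and `fs_name` are placeholders):
//
//   client->set_uuid(new_uuid);                        // before mount
//   int r = client->start_reclaim(old_uuid, CEPH_RECLAIM_RESET, fs_name);
//   if (r == 0 || r == -CEPHFS_ENOENT)                 // reclaimed, or nothing left
//     client->finish_reclaim();
//   // ... then proceed to mount as usual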
15770
15771 void Client::finish_reclaim()
15772 {
15773 auto it = metadata.find("reclaiming_uuid");
15774 if (it == metadata.end()) {
15775 for (auto &p : mds_sessions)
15776 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15777 return;
15778 }
15779
15780 for (auto &p : mds_sessions) {
15781 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
15782 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15783 p.second->con->send_message2(std::move(m));
15784 }
15785
15786 metadata["uuid"] = it->second;
15787 metadata.erase(it);
15788 }
15789
15790 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15791 {
15792 mds_rank_t from = mds_rank_t(reply->get_source().num());
15793 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15794
15795 std::scoped_lock cl(client_lock);
15796 auto session = _get_mds_session(from, reply->get_connection().get());
15797 if (!session) {
15798 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15799 return;
15800 }
15801
15802 if (reply->get_result() >= 0) {
15803 session->reclaim_state = MetaSession::RECLAIM_OK;
15804 if (reply->get_epoch() > reclaim_osd_epoch)
15805 reclaim_osd_epoch = reply->get_epoch();
15806 if (!reply->get_addrs().empty())
15807 reclaim_target_addrs = reply->get_addrs();
15808 } else {
15809 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15810 reclaim_errno = reply->get_result();
15811 }
15812
15813 signal_cond_list(waiting_for_reclaim);
15814 }
15815
15816 /**
15817 * This is included in cap release messages, to cause
15818 * the MDS to wait until this OSD map epoch. It is necessary
15819 * in corner cases where we cancel RADOS ops, so that
15820 * nobody else tries to do IO to the same objects in
15821 * the same epoch as the cancelled ops.
15822 */
15823 void Client::set_cap_epoch_barrier(epoch_t e)
15824 {
15825 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15826 cap_epoch_barrier = e;
15827 }
15828
15829 const char** Client::get_tracked_conf_keys() const
15830 {
15831 static const char* keys[] = {
15832 "client_cache_size",
15833 "client_cache_mid",
15834 "client_acl_type",
15835 "client_deleg_timeout",
15836 "client_deleg_break_on_open",
15837 "client_oc_size",
15838 "client_oc_max_objects",
15839 "client_oc_max_dirty",
15840 "client_oc_target_dirty",
15841 "client_oc_max_dirty_age",
15842 "client_caps_release_delay",
15843 "client_mount_timeout",
"client_collect_and_send_global_metrics",
15844 NULL
15845 };
15846 return keys;
15847 }
15848
15849 void Client::handle_conf_change(const ConfigProxy& conf,
15850 const std::set <std::string> &changed)
15851 {
15852 std::scoped_lock lock(client_lock);
15853
15854 if (changed.count("client_cache_mid")) {
15855 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15856 }
15857 if (changed.count("client_acl_type")) {
15858 acl_type = NO_ACL;
15859 if (cct->_conf->client_acl_type == "posix_acl")
15860 acl_type = POSIX_ACL;
15861 }
15862 if (changed.count("client_oc_size")) {
15863 objectcacher->set_max_size(cct->_conf->client_oc_size);
15864 }
15865 if (changed.count("client_oc_max_objects")) {
15866 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15867 }
15868 if (changed.count("client_oc_max_dirty")) {
15869 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15870 }
15871 if (changed.count("client_oc_target_dirty")) {
15872 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15873 }
15874 if (changed.count("client_oc_max_dirty_age")) {
15875 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15876 }
15877 if (changed.count("client_collect_and_send_global_metrics")) {
15878 _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
15879 "client_collect_and_send_global_metrics");
15880 }
15881 if (changed.count("client_caps_release_delay")) {
15882 caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
15883 "client_caps_release_delay");
15884 }
15885 if (changed.count("client_mount_timeout")) {
15886 mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
15887 "client_mount_timeout");
15888 }
15889 }
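
// Because these keys are tracked, they take effect on a live client. For
// example (value illustrative), raising the object cacher size at runtime:
//
//   ceph config set client client_oc_size 268435456
//
// is delivered through the config observer machinery and ends up in the
// objectcacher->set_max_size() call above.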
15890
15891 void intrusive_ptr_add_ref(Inode *in)
15892 {
15893 in->iget();
15894 }
15895
15896 void intrusive_ptr_release(Inode *in)
15897 {
15898 in->client->put_inode(in);
15899 }
15900
15901 mds_rank_t Client::_get_random_up_mds() const
15902 {
15903 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15904
15905 std::set<mds_rank_t> up;
15906 mdsmap->get_up_mds_set(up);
15907
15908 if (up.empty())
15909 return MDS_RANK_NONE;
15910 std::set<mds_rank_t>::const_iterator p = up.begin();
15911 for (int n = rand() % up.size(); n; n--)
15912 ++p;
15913 return *p;
15914 }
15915
15916
15917 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15918 boost::asio::io_context& ictx)
15919 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15920 {
15921 monclient->set_messenger(m);
15922 objecter->set_client_incarnation(0);
15923 }
15924
15925 StandaloneClient::~StandaloneClient()
15926 {
15927 delete objecter;
15928 objecter = nullptr;
15929 }
15930
15931 int StandaloneClient::init()
15932 {
15933 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15934 ceph_assert(iref_writer.is_first_writer());
15935
15936 _pre_init();
15937 objecter->init();
15938
15939 client_lock.lock();
15940
15941 messenger->add_dispatcher_tail(objecter);
15942 messenger->add_dispatcher_tail(this);
15943
15944 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15945 int r = monclient->init();
15946 if (r < 0) {
15947 // need to do cleanup because we're in an intermediate init state
15948 {
15949 std::scoped_lock l(timer_lock);
15950 timer.shutdown();
15951 }
15952
15953 client_lock.unlock();
15954 objecter->shutdown();
15955 objectcacher->stop();
15956 monclient->shutdown();
15957 return r;
15958 }
15959 objecter->start();
15960
15961 client_lock.unlock();
15962 _finish_init();
15963 iref_writer.update_state(CLIENT_INITIALIZED);
15964
15965 return 0;
15966 }
15967
15968 void StandaloneClient::shutdown()
15969 {
15970 Client::shutdown();
15971 objecter->shutdown();
15972 monclient->shutdown();
15973 }