]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <string.h>
22 #include <sys/stat.h>
23 #include <sys/param.h>
24 #include <fcntl.h>
25 #include <sys/file.h>
26 #ifndef _WIN32
27 #include <sys/utsname.h>
28 #endif
29 #include <sys/uio.h>
30
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
33
34 #include "common/async/waiter.h"
35
36 #if defined(__FreeBSD__)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
39 #elif !defined(_WIN32)
40 #include <sys/xattr.h>
41 #endif
42
43 #if defined(__linux__)
44 #include <linux/falloc.h>
45 #endif
46
47 #include <sys/statvfs.h>
48
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
52
53 #include "mon/MonClient.h"
54
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
72
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "mds/snap.h"
76 #include "osd/OSDMap.h"
77 #include "osdc/Filer.h"
78
79 #include "common/Cond.h"
80 #include "common/perf_counters.h"
81 #include "common/admin_socket.h"
82 #include "common/errno.h"
83 #include "include/str_list.h"
84
85 #define dout_subsys ceph_subsys_client
86
87 #include "include/lru.h"
88 #include "include/compat.h"
89 #include "include/stringify.h"
90 #include "include/random.h"
91
92 #include "Client.h"
93 #include "Inode.h"
94 #include "Dentry.h"
95 #include "Delegation.h"
96 #include "Dir.h"
97 #include "ClientSnapRealm.h"
98 #include "Fh.h"
99 #include "MetaSession.h"
100 #include "MetaRequest.h"
101 #include "ObjecterWriteback.h"
102 #include "posix_acl.h"
103
104 #include "include/ceph_assert.h"
105 #include "include/stat.h"
106
107 #include "include/cephfs/ceph_ll_client.h"
108
109 #if HAVE_GETGROUPLIST
110 #include <grp.h>
111 #include <pwd.h>
112 #include <unistd.h>
113 #endif
114
115 #undef dout_prefix
116 #define dout_prefix *_dout << "client." << whoami << " "
117
118 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119
120 // FreeBSD fails to define this
121 #ifndef O_DSYNC
122 #define O_DSYNC 0x0
123 #endif
124 // Darwin fails to define this
125 #ifndef O_RSYNC
126 #define O_RSYNC 0x0
127 #endif
128
129 #ifndef O_DIRECT
130 #define O_DIRECT 0x0
131 #endif
132
133 // Windows doesn't define those values. While the Posix compatibilty layer
134 // doesn't support those values, the Windows native functions do provide
135 // similar flags. Special care should be taken if we're going to use those
136 // flags in ceph-dokan. The current values are no-ops, while propagating
137 // them to the rest of the code might cause the Windows functions to reject
138 // them as invalid.
139 #ifndef O_NOFOLLOW
140 #define O_NOFOLLOW 0x0
141 #endif
142
143 #ifndef O_SYNC
144 #define O_SYNC 0x0
145 #endif
146
147 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
148
149 #ifndef S_IXUGO
150 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
151 #endif
152
153 using std::dec;
154 using std::hex;
155 using std::list;
156 using std::oct;
157 using std::pair;
158 using std::string;
159 using std::vector;
160
161 using namespace TOPNSPC::common;
162
163 namespace bs = boost::system;
164 namespace ca = ceph::async;
165
166 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
167 {
168 Client *client = static_cast<Client*>(p);
169 client->flush_set_callback(oset);
170 }
171
172 bool Client::is_reserved_vino(vinodeno_t &vino) {
173 if (MDS_IS_PRIVATE_INO(vino.ino)) {
174 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
175 return true;
176 }
177 return false;
178 }
179
180 // running average and standard deviation -- presented in
181 // Donald Knuth's TAoCP, Volume II.
// Incremental running mean (Knuth, TAoCP Vol. II):
//   avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
// The first sample (count == 1) is the mean itself.
double calc_average(double old_avg, double value, uint64_t count) {
  if (count == 1)
    return value;
  return old_avg + (value - old_avg) / count;
}
192
// Welford-style update of the running sum of squared deviations:
//   S_n = S_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
// A single sample has zero spread.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  if (count == 1)
    return 0.0;
  return old_sq_sum + (value - old_mean) * (value - new_mean);
}
204
205 // -------------
206
207 Client::CommandHook::CommandHook(Client *client) :
208 m_client(client)
209 {
210 }
211
212 int Client::CommandHook::call(
213 std::string_view command,
214 const cmdmap_t& cmdmap,
215 const bufferlist&,
216 Formatter *f,
217 std::ostream& errss,
218 bufferlist& out)
219 {
220 f->open_object_section("result");
221 {
222 std::scoped_lock l{m_client->client_lock};
223 if (command == "mds_requests")
224 m_client->dump_mds_requests(f);
225 else if (command == "mds_sessions") {
226 bool cap_dump = false;
227 cmd_getval(cmdmap, "cap_dump", cap_dump);
228 m_client->dump_mds_sessions(f, cap_dump);
229 } else if (command == "dump_cache")
230 m_client->dump_cache(f);
231 else if (command == "kick_stale_sessions")
232 m_client->_kick_stale_sessions();
233 else if (command == "status")
234 m_client->dump_status(f);
235 else
236 ceph_abort_msg("bad command registered");
237 }
238 f->close_section();
239 return 0;
240 }
241
242
243 // -------------
244
245 int Client::get_fd_inode(int fd, InodeRef *in) {
246 int r = 0;
247 if (fd == CEPHFS_AT_FDCWD) {
248 *in = cwd;
249 } else {
250 Fh *f = get_filehandle(fd);
251 if (!f) {
252 r = -CEPHFS_EBADF;
253 } else {
254 *in = f->inode;
255 }
256 }
257 return r;
258 }
259
// Construct a directory read handle for readdir on 'in', capturing the
// caller's credentials for later permission checks.  next_offset starts
// at 2 — offsets 0 and 1 appear reserved for the "." / ".." entries
// (NOTE(review): inferred from the offset scheme; confirm in Client.h).
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
265
266 void Client::_reset_faked_inos()
267 {
268 ino_t start = 1024;
269 free_faked_inos.clear();
270 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
271 last_used_faked_ino = 0;
272 last_used_faked_root = 0;
273 #ifdef _WIN32
274 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
275 // Windows structures, including Dokan ones, are using 64B identifiers.
276 _use_faked_inos = false;
277 #else
278 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
279 #endif
280 }
281
// Allocate the next free faked inode number for 'in', scanning the
// free_faked_inos interval set upward from the last number handed out
// and wrapping around when the top of the range is reached.
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  // Find the first free interval at or above last_used_faked_ino + 1.
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Ran off the top of the range: wrap and rescan from just above the
    // window reserved for faked roots.
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // If this fires, every faked ino is in use.
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // There is a gap before the next free interval: jump to its start.
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Still inside the current free interval: take the next number.
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Mark the number as used and record the faked -> real vino mapping so
  // _map_faked_ino() can translate it back.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
303
304 /*
305 * In the faked mode, if you export multiple subdirectories,
306 * you will see that the inode numbers of the exported subdirectories
307 * are the same. so we distinguish the mount point by reserving
308 * the "fake ids" between "1024~2048" and combining the last
309 * 10bits(0x3ff) of the "root inodes".
310 */
311 void Client::_assign_faked_root(Inode *in)
312 {
313 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
314 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
315 last_used_faked_root = 0;
316 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
317 }
318 ceph_assert(it != free_faked_inos.end());
319 vinodeno_t inode_info = in->vino();
320 uint64_t inode_num = (uint64_t)inode_info.ino;
321 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
322 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
323 ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);
324
325 in->faked_ino = last_used_faked_root;
326 free_faked_inos.erase(in->faked_ino);
327 faked_ino_map[in->faked_ino] = in->vino();
328 }
329
330 void Client::_release_faked_ino(Inode *in)
331 {
332 free_faked_inos.insert(in->faked_ino);
333 faked_ino_map.erase(in->faked_ino);
334 }
335
336 vinodeno_t Client::_map_faked_ino(ino_t ino)
337 {
338 vinodeno_t vino;
339 if (ino == 1)
340 vino = root->vino();
341 else if (faked_ino_map.count(ino))
342 vino = faked_ino_map[ino];
343 else
344 vino = vinodeno_t(0, CEPH_NOSNAP);
345 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
346 return vino;
347 }
348
349 vinodeno_t Client::map_faked_ino(ino_t ino)
350 {
351 std::scoped_lock lock(client_lock);
352 return _map_faked_ino(ino);
353 }
354
355 // cons/des
356
/**
 * Construct a Client on top of an existing Messenger, MonClient and
 * Objecter (none of which it owns, though it does take a ref on the
 * CephContext via cct_deleter).  Sets up config-derived knobs, the
 * faked-ino allocator, the fd allocator and the object cacher; no
 * network activity happens until init()/mount().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // Snapshot config values that are consulted on hot paths.
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  // fds below 10 are reserved (never allocated to callers).
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					    &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				  client_flush_set_callback,    // all commit callback
				  (void*)this,
				  cct->_conf->client_oc_size,
				  cct->_conf->client_oc_max_objects,
				  cct->_conf->client_oc_max_dirty,
				  cct->_conf->client_oc_target_dirty,
				  cct->_conf->client_oc_max_dirty_age,
				  true));
}
415
416
/**
 * Destructor.  Normally shutdown()/unmount() has already run; this is
 * the safety net for the case where the process aborted before getting
 * the chance.  Stops the upkeep (tick) thread first, then tears down
 * the metadata cache under client_lock.
 */
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or aborted and didn't get any chance to run
  // umount/shutdown, stop the tick thread here.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  // Join outside client_lock so the upkeep thread can take it to exit.
  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that it's lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
438
/**
 * Force-release every open file handle and directory handle, then drain
 * the dentry LRU and drop the root inode, asserting that nothing is
 * left pinned in inode_map.  Caller must hold client_lock (see ~Client).
 */
void Client::tear_down_cache()
{
  // fd's: release each handle; the map itself is cleared afterwards.
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // _closedir removes the dir_result_t from opened_dirs, so keep taking
  // the first element until the set is empty.
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  // Only root (plus any root_parents pinned while mounting subtrees)
  // should remain in inode_map at this point.
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
469
470 inodeno_t Client::get_root_ino()
471 {
472 std::scoped_lock l(client_lock);
473 if (use_faked_inos())
474 return root->faked_ino;
475 else
476 return root->ino;
477 }
478
479 Inode *Client::get_root()
480 {
481 std::scoped_lock l(client_lock);
482 root->ll_get();
483 return root.get();
484 }
485
486
487 // debug crapola
488
489 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
490 {
491 filepath path;
492 in->make_long_path(path);
493 ldout(cct, 1) << "dump_inode: "
494 << (disconnected ? "DISCONNECTED ":"")
495 << "inode " << in->ino
496 << " " << path
497 << " ref " << in->get_nref()
498 << " " << *in << dendl;
499
500 if (f) {
501 f->open_object_section("inode");
502 f->dump_stream("path") << path;
503 if (disconnected)
504 f->dump_int("disconnected", 1);
505 in->dump(f);
506 f->close_section();
507 }
508
509 did.insert(in);
510 if (in->dir) {
511 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
512 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
513 it != in->dir->dentries.end();
514 ++it) {
515 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
516 if (f) {
517 f->open_object_section("dentry");
518 it->second->dump(f);
519 f->close_section();
520 }
521 if (it->second->inode)
522 dump_inode(f, it->second->inode.get(), did, false);
523 }
524 }
525 }
526
527 void Client::dump_cache(Formatter *f)
528 {
529 set<Inode*> did;
530
531 ldout(cct, 1) << __func__ << dendl;
532
533 if (f)
534 f->open_array_section("cache");
535
536 if (root)
537 dump_inode(f, root.get(), did, true);
538
539 // make a second pass to catch anything disconnected
540 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
541 it != inode_map.end();
542 ++it) {
543 if (did.count(it->second))
544 continue;
545 dump_inode(f, it->second, did, true);
546 }
547
548 if (f)
549 f->close_section();
550 }
551
/**
 * Emit the client's overall status (identity, addresses, cache counts,
 * map epochs, blocklist state) through the Formatter.  Backs the
 * "status" admin-socket command; caller must hold client_lock.
 */
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // Session metadata as sent to the MDS (hostname, mount point, ...).
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
583
// First stage of initialization, shared by init paths: start the timer,
// the objecter finisher (which the Filer depends on), and the object
// cacher's flusher thread.
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}
593
/**
 * Public one-shot initialization: transitions initialize_state
 * NEW -> INITIALIZING -> INITIALIZED.  The RWRef writer guard ensures
 * only the first caller performs the work.  Returns 0.
 */
int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    // Start receiving messages only after _pre_init() set everything up.
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
608
609 void Client::_finish_init()
610 {
611 {
612 std::scoped_lock l{client_lock};
613 // logger
614 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
615 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
616 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
617 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
618 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
619 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
620 // average, standard deviation mds/r/w/ latencies
621 plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
622 plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
623 plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
624 plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
625 plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
626 plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
627 plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
628 plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
629 plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
630 logger.reset(plb.create_perf_counters());
631 cct->get_perfcounters_collection()->add(logger.get());
632 }
633
634 cct->_conf.add_observer(this);
635
636 AdminSocket* admin_socket = cct->get_admin_socket();
637 int ret = admin_socket->register_command("mds_requests",
638 &m_command_hook,
639 "show in-progress mds requests");
640 if (ret < 0) {
641 lderr(cct) << "error registering admin socket command: "
642 << cpp_strerror(-ret) << dendl;
643 }
644 ret = admin_socket->register_command("mds_sessions "
645 "name=cap_dump,type=CephBool,req=false",
646 &m_command_hook,
647 "show mds session state");
648 if (ret < 0) {
649 lderr(cct) << "error registering admin socket command: "
650 << cpp_strerror(-ret) << dendl;
651 }
652 ret = admin_socket->register_command("dump_cache",
653 &m_command_hook,
654 "show in-memory metadata cache contents");
655 if (ret < 0) {
656 lderr(cct) << "error registering admin socket command: "
657 << cpp_strerror(-ret) << dendl;
658 }
659 ret = admin_socket->register_command("kick_stale_sessions",
660 &m_command_hook,
661 "kick sessions that were remote reset");
662 if (ret < 0) {
663 lderr(cct) << "error registering admin socket command: "
664 << cpp_strerror(-ret) << dendl;
665 }
666 ret = admin_socket->register_command("status",
667 &m_command_hook,
668 "show overall client status");
669 if (ret < 0) {
670 lderr(cct) << "error registering admin socket command: "
671 << cpp_strerror(-ret) << dendl;
672 }
673 }
674
/**
 * Orderly teardown of the client: stop the tick thread, close MDS
 * sessions, unregister observers/commands, drain every callback
 * finisher, stop the object cacher, then flip initialize_state back to
 * CLIENT_NEW (blocking new readers) and wait for in-flight ones before
 * shutting down the timer, objecter finisher and perf counters.
 */
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // To make sure the tick thread will be stoppped before
    // destructing the Client, just in case like the _mount()
    // failed but didn't not get a chance to stop the tick
    // thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  // Each finisher is drained (wait_for_empty) before stopping so queued
  // callbacks run to completion rather than being dropped.
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shuting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
754
755 void Client::update_io_stat_metadata(utime_t latency) {
756 auto lat_nsec = latency.to_nsec();
757 // old values are used to compute new ones
758 auto o_avg = logger->tget(l_c_md_avg).to_nsec();
759 auto o_sqsum = logger->get(l_c_md_sqsum);
760
761 auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
762 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
763 nr_metadata_request);
764
765 logger->tinc(l_c_lat, latency);
766 logger->tinc(l_c_reply, latency);
767
768 utime_t avg;
769 avg.set_from_double(n_avg / 1000000000);
770 logger->tset(l_c_md_avg, avg);
771 logger->set(l_c_md_sqsum, n_sqsum);
772 logger->set(l_c_md_ops, nr_metadata_request);
773 }
774
775 void Client::update_io_stat_read(utime_t latency) {
776 auto lat_nsec = latency.to_nsec();
777 // old values are used to compute new ones
778 auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
779 auto o_sqsum = logger->get(l_c_rd_sqsum);
780
781 auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
782 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
783 nr_read_request);
784
785 logger->tinc(l_c_read, latency);
786
787 utime_t avg;
788 avg.set_from_double(n_avg / 1000000000);
789 logger->tset(l_c_rd_avg, avg);
790 logger->set(l_c_rd_sqsum, n_sqsum);
791 logger->set(l_c_rd_ops, nr_read_request);
792 }
793
794 void Client::update_io_stat_write(utime_t latency) {
795 auto lat_nsec = latency.to_nsec();
796 // old values are used to compute new ones
797 auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
798 auto o_sqsum = logger->get(l_c_wr_sqsum);
799
800 auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
801 auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
802 nr_write_request);
803
804 logger->tinc(l_c_wrlat, latency);
805
806 utime_t avg;
807 avg.set_from_double(n_avg / 1000000000);
808 logger->tset(l_c_wr_avg, avg);
809 logger->set(l_c_wr_sqsum, n_sqsum);
810 logger->set(l_c_wr_ops, nr_write_request);
811 }
812
813 // ===================
814 // metadata cache stuff
815
/**
 * Trim the dentry LRU down to client_cache_size (or completely, when
 * unmounting).  Optionally asks the kernel to drop its dcache too, and
 * releases the root inode if nothing else references it.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // Fixed-point loop: stop once a full pass fails to shrink the LRU
  // (lru_get_next_expire can decline to give up pinned entries).
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // When unmounting we trim everything, not just down to 'max'.
    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
843
844 void Client::trim_cache_for_reconnect(MetaSession *s)
845 {
846 mds_rank_t mds = s->mds_num;
847 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
848
849 int trimmed = 0;
850 list<Dentry*> skipped;
851 while (lru.lru_get_size() > 0) {
852 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
853 if (!dn)
854 break;
855
856 if ((dn->inode && dn->inode->caps.count(mds)) ||
857 dn->dir->parent_inode->caps.count(mds)) {
858 trim_dentry(dn);
859 trimmed++;
860 } else
861 skipped.push_back(dn);
862 }
863
864 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
865 lru.lru_insert_mid(*p);
866
867 ldout(cct, 20) << __func__ << " mds." << mds
868 << " trimmed " << trimmed << " dentries" << dendl;
869
870 if (s->caps.size() > 0)
871 _invalidate_kernel_dcache();
872 }
873
// Evict one dentry from the cache: mark its parent directory no longer
// complete/ordered (since we are forgetting one of its entries), then
// unlink it from both the dir and the inode.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
886
887
/**
 * Apply MDS-reported file size / truncation state to an inode.
 *
 * truncate_seq orders competing size updates: a strictly newer seq (or
 * the same seq with a larger size) wins.  On a seq bump the cached file
 * data past the new size and any inline data are invalidated/trimmed.
 * truncate_size is applied separately for any seq >= ours.
 */
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, size, prior_size - size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      // truncate_size is only meaningful for regular files.
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
929
930 void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
931 utime_t ctime, utime_t mtime, utime_t atime)
932 {
933 ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
934 << " ctime " << ctime << " mtime " << mtime << dendl;
935
936 if (time_warp_seq > in->time_warp_seq)
937 ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
938 << " is higher than local time_warp_seq "
939 << in->time_warp_seq << dendl;
940
941 int warn = false;
942 // be careful with size, mtime, atime
943 if (issued & (CEPH_CAP_FILE_EXCL|
944 CEPH_CAP_FILE_WR|
945 CEPH_CAP_FILE_BUFFER|
946 CEPH_CAP_AUTH_EXCL|
947 CEPH_CAP_XATTR_EXCL)) {
948 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
949 if (ctime > in->ctime)
950 in->ctime = ctime;
951 if (time_warp_seq > in->time_warp_seq) {
952 //the mds updated times, so take those!
953 in->mtime = mtime;
954 in->atime = atime;
955 in->time_warp_seq = time_warp_seq;
956 } else if (time_warp_seq == in->time_warp_seq) {
957 //take max times
958 if (mtime > in->mtime)
959 in->mtime = mtime;
960 if (atime > in->atime)
961 in->atime = atime;
962 } else if (issued & CEPH_CAP_FILE_EXCL) {
963 //ignore mds values as we have a higher seq
964 } else warn = true;
965 } else {
966 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
967 if (time_warp_seq >= in->time_warp_seq) {
968 in->ctime = ctime;
969 in->mtime = mtime;
970 in->atime = atime;
971 in->time_warp_seq = time_warp_seq;
972 } else warn = true;
973 }
974 if (warn) {
975 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
976 << time_warp_seq << " is lower than local time_warp_seq "
977 << in->time_warp_seq
978 << dendl;
979 }
980 }
981
982 void Client::_fragmap_remove_non_leaves(Inode *in)
983 {
984 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
985 if (!in->dirfragtree.is_leaf(p->first))
986 in->fragmap.erase(p++);
987 else
988 ++p;
989 }
990
991 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
992 {
993 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
994 if (p->second == mds)
995 in->fragmap.erase(p++);
996 else
997 ++p;
998 }
999
/*
 * add_update_inode - create or refresh a cached Inode from an MDS InodeStat.
 *
 * Fields are overwritten only when the incoming info is newer than what we
 * hold (version check) or when the matching SHARED cap was newly issued,
 * and never while we hold the corresponding EXCL cap (our local state is
 * then more authoritative).  Also installs/updates the cap carried in
 * st->cap for this session.
 *
 * st:            decoded inode attributes from the MDS reply
 * from:          time the originating request was sent (lease/cap base time)
 * session:       MDS session the reply arrived on
 * request_perms: credentials of the triggering request
 *
 * Returns the cached (possibly newly allocated) Inode.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    // already cached
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever cached becomes root (and the initial cwd)
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      // while mounting we may discover ancestors above the current root;
      // keep the chain so the true root can be resolved
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we currently hold (issued or dirty); fields covered by an EXCL
  // cap below are locally authoritative and must not be clobbered
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  bool need_snapdir_attr_refresh = false;
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
    in->fscrypt_auth = st->fscrypt_auth;
    need_snapdir_attr_refresh = true;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    need_snapdir_attr_refresh = true;
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    in->fscrypt_file = st->fscrypt_file;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
    need_snapdir_attr_refresh = true;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  ldout(cct, 12) << __func__ << " client inode change_attr: " << in->change_attr << " , mds inodestat change_attr: " << st->change_attr << dendl;
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    // max_size/rstat only trusted when the reply came via the auth cap's session
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes carry no live caps; just accumulate the bits
    in->snap_caps |= st->cap.caps;
  }

  // keep a cached snapdir's attributes in sync with its live parent
  if (need_snapdir_attr_refresh && in->is_dir() && in->snapid == CEPH_NOSNAP) {
    vinodeno_t vino(in->ino, CEPH_SNAPDIR);
    if (inode_map.count(vino)) {
      refresh_snapdir_attrs(inode_map[vino], in);
    }
  }

  return in;
}
1170
1171
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * Looks up dname in dir; if an existing dentry points at a different inode
 * it is unlinked first.  old_dentry, when supplied (rename), is unlinked
 * and any rename waiters are signaled once the new link is in place.
 * Finishes by refreshing the dentry lease from dlease.
 *
 * Returns the (re)linked Dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // dentry already points at the right inode; just refresh its LRU position
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale link to a different inode; drop it before relinking below
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a ref across the unlink/link dance so 'in' cannot be freed
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories: source dir ordering is now stale
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);

    if (old_dentry) {
      // rename complete; wake anyone blocked on the in-flight rename
      dn->is_renaming = false;
      signal_cond_list(waiting_for_rename);
    }
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1223
1224 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1225 {
1226 utime_t dttl = from;
1227 dttl += (float)dlease->duration_ms / 1000.0;
1228
1229 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
1230
1231 ceph_assert(dn);
1232
1233 if (dlease->mask & CEPH_LEASE_VALID) {
1234 if (dttl > dn->lease_ttl) {
1235 ldout(cct, 10) << "got dentry lease on " << dn->name
1236 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1237 dn->lease_ttl = dttl;
1238 dn->lease_mds = session->mds_num;
1239 dn->lease_seq = dlease->seq;
1240 dn->lease_gen = session->cap_gen;
1241 }
1242 }
1243 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1244 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1245 dn->mark_primary();
1246 dn->alternate_name = std::move(dlease->alternate_name);
1247 }
1248
1249
1250 /*
1251 * update MDS location cache for a single inode
1252 */
1253 void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
1254 {
1255 // auth
1256 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1257 if (dst->auth >= 0) {
1258 in->fragmap[dst->frag] = dst->auth;
1259 } else {
1260 in->fragmap.erase(dst->frag);
1261 }
1262 if (!in->dirfragtree.is_leaf(dst->frag)) {
1263 in->dirfragtree.force_to_leaf(cct, dst->frag);
1264 _fragmap_remove_non_leaves(in);
1265 }
1266
1267 // replicated, only update from auth mds reply
1268 if (from == dst->auth) {
1269 in->dir_replicated = !dst->dist.empty();
1270 if (!dst->dist.empty())
1271 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1272 else
1273 in->frag_repmap.erase(dst->frag);
1274 }
1275 }
1276
1277 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1278 {
1279 if (complete)
1280 diri->dir_release_count++;
1281 else
1282 diri->dir_ordered_count++;
1283 if (diri->flags & I_COMPLETE) {
1284 if (complete) {
1285 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1286 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1287 } else {
1288 if (diri->flags & I_DIR_ORDERED) {
1289 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1290 diri->flags &= ~I_DIR_ORDERED;
1291 }
1292 }
1293 if (diri->dir)
1294 diri->dir->readdir_cache.clear();
1295 }
1296 }
1297
/*
 * insert results from readdir, lssnap or readdir_snapdiff into the
 * metadata cache, filling both the per-fd dir_result_t buffer and
 * (when still valid) the shared readdir cache on the Dir.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session,
				    Inode *diri, Inode *diri_other) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    // new-style reply encoding implies all feature bits for decoding
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir, lssnap and
  // readdir_snapdiff replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }
    // snapdiff carries its frag/offset in a different args union member
    bool snapdiff_req = request->head.op == CEPH_MDS_OP_READDIR_SNAPDIFF;
    frag_t fg;
    unsigned offset_hash;
    if (snapdiff_req) {
      fg = (unsigned)request->head.args.snapdiff.frag;
      offset_hash = (unsigned)request->head.args.snapdiff.offset_hash;
    } else {
      fg = (unsigned)request->head.args.readdir.frag;
      offset_hash = (unsigned)request->head.args.readdir.offset_hash;
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);
    //open opponent dir for snapdiff if any
    Dir *dir_other = nullptr;
    if (snapdiff_req) {
      ceph_assert(diri_other);
      dir_other = diri_other->open_dir();
      ceph_assert(dir_other);
    }

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // offset 2 is the first real entry; an empty last_name implies we
    // are at the start of the fragment (see assert)
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = offset_hash;
      }
    }

    if (fg != dst.frag) {
      // the frag split/merged since we sent the request; adopt the frag
      // the mds actually answered for and restart it
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // a listing starting from the very beginning may (re)prime the
    // shared readdir cache; snapshot the dir's change counters so we
    // can tell later whether the cache stayed valid
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      // for snapdiff, entries belonging to the other snapshot are linked
      // under the other directory
      auto *effective_dir = dir;
      auto *effective_diri = diri;

      if (snapdiff_req && in->snapid != diri->snapid) {
	ceph_assert(diri_other);
	ceph_assert(dir_other);
	effective_diri = diri_other;
	effective_dir = dir_other;
      }
      Dentry *dn;
      if (effective_dir->dentries.count(dname)) {
	Dentry *olddn = effective_dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(effective_dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(effective_dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// in hash order, offsets restart at 2 whenever the hash changes
	unsigned hash = ceph_frag_value(effective_diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (!snapdiff_req &&
	  dirp->release_count == effective_diri->dir_release_count &&
	  dirp->ordered_count == effective_diri->dir_ordered_count &&
	  dirp->start_shared_gen == effective_diri->shared_gen) {
	if (dirp->cache_index == effective_dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  effective_dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < effective_dir->readdir_cache.size()) {
	  // a complete+ordered dir's cache must already agree with us
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(effective_dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    effective_dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    // drop Dir structs we didn't end up populating
    if (dir->is_empty())
      close_dir(dir);
    if (dir_other && dir_other->is_empty())
      close_dir(dir_other);
  }
}
1490
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A trace may carry a directory inode, a dentry and a target inode;
 * whatever is present is folded into the local cache.  Readdir-style
 * ops additionally hand their extra payload to insert_readdir_results().
 * Returns the target inode, or NULL for traceless / duplicate-unsafe
 * replies.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the unsafe reply was already applied; the safe reply has no trace
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we can only invalidate what we may have cached
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    // new-style reply encoding implies all feature bits for decoding
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(session, reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	 << " is_target=" << (int)reply->head.is_target
	 << " is_dentry=" << (int)reply->head.is_dentry
	 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug-mode sanity check: if we asked for xattrs, the mds must
      // have included them in the reply
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target: drop any stale link, and keep a negative
      // dentry around only if we were granted a lease on it
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    // no lease for the synthesized dentry
    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request,
			     session,
			     in,
			     nullptr);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    } else if (op == CEPH_MDS_OP_READDIR_SNAPDIFF) {
      // provide both request's inode (aka snapA) and traced one (snapB)
      // to properly match snapdiff results
      insert_readdir_results(request,
			     session,
			     request->inode(),
			     in);
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1675
1676 // -------
1677
/*
 * choose_target_mds - pick the mds rank a request should be sent to.
 *
 * Preference order: an explicit resend_mds (forwarded request), a random
 * mds if configured, the mds owning the dirfrag the dentry name hashes
 * into (or one of its replicas when auth is not required), the inode's
 * auth cap session, any cap session, and finally a random up mds.
 * *phash_diri is set when the choice came from the fragmap so the caller
 * can prune that entry if the mds turns out to be gone.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;
  int issued = 0;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    // a forward pinned the target; honor it exactly once
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component to find the dirfrag it lives in
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash its name within the parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes hold no caps; climb to the nearest non-snap parent
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << __func__ << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    if (req->get_op() == CEPH_MDS_OP_GETATTR)
      issued = req->inode()->caps_issued();

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best(issued)) {
	// auth not required: spread load by picking a random replica
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	req->send_to_auth = true;
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to whichever session we hold caps on
    if (in->auth_cap && req->auth_is_best(issued)) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1792
1793 void Client::connect_mds_targets(mds_rank_t mds)
1794 {
1795 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1796 ceph_assert(mds_sessions.count(mds));
1797 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1798 for (const auto &rank : info.export_targets) {
1799 if (mds_sessions.count(rank) == 0 &&
1800 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
1801 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1802 << " export target mds." << rank << dendl;
1803
1804 auto session = _get_or_open_mds_session(rank);
1805 if (session->state == MetaSession::STATE_OPENING ||
1806 session->state == MetaSession::STATE_OPEN)
1807 continue;
1808
1809 _open_mds_session(rank);
1810 }
1811 }
1812 }
1813
1814 void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
1815 {
1816 f->dump_int("id", get_nodeid().v);
1817 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1818 f->dump_object("inst", inst);
1819 f->dump_stream("inst_str") << inst;
1820 f->dump_stream("addr_str") << inst.addr;
1821 f->open_array_section("sessions");
1822 for (const auto &p : mds_sessions) {
1823 f->open_object_section("session");
1824 p.second->dump(f, cap_dump);
1825 f->close_section();
1826 }
1827 f->close_section();
1828 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1829 }
1830
1831 void Client::dump_mds_requests(Formatter *f)
1832 {
1833 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1834 p != mds_requests.end();
1835 ++p) {
1836 f->open_object_section("request");
1837 p->second->dump(f);
1838 f->close_section();
1839 }
1840 }
1841
/*
 * verify_reply_trace - resolve the target inode of a (possibly traceless)
 * mutation reply.
 *
 * Extracts the created ino from the reply's extra payload (a bare u64, or
 * an openc_response_t when the session supports delegated inos) and sets
 * *pcreated.  If the reply carried no trace, falls back to a lookup (by
 * dentry name) or getattr (by inode) to find the target, cross-checking
 * it against the created ino.  Returns r, possibly downgraded to
 * -CEPHFS_EINTR when the created ino and the looked-up target disagree.
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the trace already resolved and pinned the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    // no trace target; try the created ino in the local inode cache first
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	// no dentry either; refresh the request's inode directly
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -CEPHFS_EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1926
1927
1928 /**
1929 * make a request
1930 *
1931 * Blocking helper to make an MDS request.
1932 *
1933 * If the ptarget flag is set, behavior changes slightly: the caller
1934 * expects to get a pointer to the inode we are creating or operating
1935 * on. As a result, we will follow up any traceless mutation reply
1936 * with a getattr or lookup to transparently handle a traceless reply
1937 * from the MDS (as when the MDS restarts and the client has to replay
1938 * a request).
1939 *
1940 * @param request the MetaRequest to execute
1941 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1942 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1943 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1944 * @param use_mds [optional] prefer a specific mds (-1 for default)
1945 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1946 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl,
			 size_t feature_needed)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();
  request->created = ceph::coarse_mono_clock::now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock can block indefinitely, so don't let it hold back the
  // oldest_tid watermark reported to the mds
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, make sure its session is open, send, and
  // wait; loop again on forward/kick until a reply or abort
  MetaSessionRef session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the chosen rank no longer exists in the map; forget the stale
	// fragmap hint (or pick a random mds) and retry immediately
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-CEPHFS_EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // refuse up front if the mds can't handle this op at all
    if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) {
      request->abort(-CEPHFS_EOPNOTSUPP);
      break;
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // adopt the already-held client_lock so the condvar wait can drop
    // and reacquire it; release() afterwards hands ownership back to
    // the caller without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	          // reply
	      request->resend_mds >= 0 || // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted: drop our bookkeeping and report the abort code
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // resolve the target inode (handles traceless replies) if requested
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;

  ++nr_metadata_request;
  update_io_stat_metadata(lat);

  put_request(request);
  return r;
}
2100
2101 void Client::unregister_request(MetaRequest *req)
2102 {
2103 mds_requests.erase(req->tid);
2104 if (req->tid == oldest_tid) {
2105 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2106 while (true) {
2107 if (p == mds_requests.end()) {
2108 oldest_tid = 0;
2109 break;
2110 }
2111 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2112 oldest_tid = p->first;
2113 break;
2114 }
2115 ++p;
2116 }
2117 }
2118 put_request(req);
2119 }
2120
2121 void Client::put_request(MetaRequest *request)
2122 {
2123 if (request->_put()) {
2124 int op = -1;
2125 if (request->success)
2126 op = request->get_op();
2127 InodeRef other_in;
2128 request->take_other_inode(&other_in);
2129 delete request;
2130
2131 if (other_in &&
2132 (op == CEPH_MDS_OP_RMDIR ||
2133 op == CEPH_MDS_OP_RENAME ||
2134 op == CEPH_MDS_OP_RMSNAP)) {
2135 _try_to_trim_inode(other_in.get(), false);
2136 }
2137 }
2138 }
2139
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  // Try to encode a cap release for 'in' into 'req', destined for 'mds'.
  // Caps in 'drop' are released unless any cap in 'unless' is currently
  // issued, and never if they are dirty or in use.  Returns nonzero when
  // a release record was appended to req->cap_releases (which also
  // happens when 'force' is set even though nothing was dropped).
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or currently in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      // Nothing droppable; still emit a release record if forced.
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      // If this is the auth cap and we no longer want any file write
      // caps, stop asking the MDS to grow max_size.
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      // Fill in the wire-format release record and queue it on the
      // request; dname fields stay zero here and are populated later by
      // encode_dentry_release() for dentry releases.
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
2185
2186 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2187 mds_rank_t mds, int drop, int unless)
2188 {
2189 ldout(cct, 20) << __func__ << " enter(dn:"
2190 << dn << ")" << dendl;
2191 int released = 0;
2192 if (dn->dir)
2193 released = encode_inode_release(dn->dir->parent_inode, req,
2194 mds, drop, unless, 1);
2195 if (released && dn->lease_mds == mds) {
2196 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
2197 auto& rel = req->cap_releases.back();
2198 rel.item.dname_len = dn->name.length();
2199 rel.item.dname_seq = dn->lease_seq;
2200 rel.dname = dn->name;
2201 dn->lease_mds = -1;
2202 }
2203 ldout(cct, 25) << __func__ << " exit(dn:"
2204 << dn << ")" << dendl;
2205 }
2206
2207
/*
 * This requires the MetaRequest's inode/dentry members to be set.
 * It will error out horribly without them.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
2214 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2215 {
2216 ldout(cct, 20) << __func__ << " enter (req: "
2217 << req << ", mds: " << mds << ")" << dendl;
2218 if (req->inode_drop && req->inode())
2219 encode_inode_release(req->inode(), req,
2220 mds, req->inode_drop,
2221 req->inode_unless);
2222
2223 if (req->old_inode_drop && req->old_inode())
2224 encode_inode_release(req->old_inode(), req,
2225 mds, req->old_inode_drop,
2226 req->old_inode_unless);
2227 if (req->other_inode_drop && req->other_inode())
2228 encode_inode_release(req->other_inode(), req,
2229 mds, req->other_inode_drop,
2230 req->other_inode_unless);
2231
2232 if (req->dentry_drop && req->dentry())
2233 encode_dentry_release(req->dentry(), req,
2234 mds, req->dentry_drop,
2235 req->dentry_unless);
2236
2237 if (req->old_dentry_drop && req->old_dentry())
2238 encode_dentry_release(req->old_dentry(), req,
2239 mds, req->old_dentry_drop,
2240 req->old_dentry_unless);
2241 ldout(cct, 25) << __func__ << " exit (req: "
2242 << req << ", mds " << mds <<dendl;
2243 }
2244
2245 bool Client::have_open_session(mds_rank_t mds)
2246 {
2247 const auto &it = mds_sessions.find(mds);
2248 return it != mds_sessions.end() &&
2249 (it->second->state == MetaSession::STATE_OPEN ||
2250 it->second->state == MetaSession::STATE_STALE);
2251 }
2252
2253 MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
2254 {
2255 const auto &it = mds_sessions.find(mds);
2256 if (it == mds_sessions.end() || it->second->con != con) {
2257 return NULL;
2258 } else {
2259 return it->second;
2260 }
2261 }
2262
2263 MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
2264 {
2265 auto it = mds_sessions.find(mds);
2266 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
2267 }
2268
2269 /**
2270 * Populate a map of strings with client-identifying metadata,
2271 * such as the hostname. Call this once at initialization.
2272 */
2273 void Client::populate_metadata(const std::string &mount_root)
2274 {
2275 // Hostname
2276 #ifdef _WIN32
2277 // TODO: move this to compat.h
2278 char hostname[64];
2279 DWORD hostname_sz = 64;
2280 GetComputerNameA(hostname, &hostname_sz);
2281 metadata["hostname"] = hostname;
2282 #else
2283 struct utsname u;
2284 int r = uname(&u);
2285 if (r >= 0) {
2286 metadata["hostname"] = u.nodename;
2287 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2288 } else {
2289 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2290 }
2291 #endif
2292
2293 metadata["pid"] = stringify(getpid());
2294
2295 // Ceph entity id (the '0' in "client.0")
2296 metadata["entity_id"] = cct->_conf->name.get_id();
2297
2298 // Our mount position
2299 if (!mount_root.empty()) {
2300 metadata["root"] = mount_root;
2301 }
2302
2303 // Ceph version
2304 metadata["ceph_version"] = pretty_version_to_str();
2305 metadata["ceph_sha1"] = git_version_to_str();
2306
2307 // Apply any metadata from the user's configured overrides
2308 std::vector<std::string> tokens;
2309 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2310 for (const auto &i : tokens) {
2311 auto eqpos = i.find("=");
2312 // Throw out anything that isn't of the form "<str>=<str>"
2313 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2314 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2315 continue;
2316 }
2317 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2318 }
2319 }
2320
2321 /**
2322 * Optionally add or override client metadata fields.
2323 */
2324 void Client::update_metadata(std::string const &k, std::string const &v)
2325 {
2326 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2327 ceph_assert(iref_reader.is_state_satisfied());
2328
2329 std::scoped_lock l(client_lock);
2330
2331 auto it = metadata.find(k);
2332 if (it != metadata.end()) {
2333 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2334 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2335 }
2336
2337 metadata[k] = v;
2338 }
2339
2340 MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
2341 {
2342 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2343 auto addrs = mdsmap->get_addrs(mds);
2344 auto em = mds_sessions.emplace(std::piecewise_construct,
2345 std::forward_as_tuple(mds),
2346 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
2347 ceph_assert(em.second); /* not already present */
2348 auto session = em.first->second;
2349
2350 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2351 m->metadata = metadata;
2352 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2353 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2354 session->con->send_message2(std::move(m));
2355 return session;
2356 }
2357
2358 void Client::_close_mds_session(MetaSession *s)
2359 {
2360 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2361 s->state = MetaSession::STATE_CLOSING;
2362 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2363 }
2364
void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
{
  // Finalize a session the MDS has closed or rejected: record the final
  // state, drop the connection, wake waiters, release the session's
  // caps and kick its outstanding requests.
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  // A rejection only overrides the state if we were not already in the
  // middle of a voluntary close.
  if (rejected && s->state != MetaSession::STATE_CLOSING)
    s->state = MetaSession::STATE_REJECTED;
  else
    s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s, err);
  kick_requests_closed(s);
  mds_ranks_closing.erase(s->mds_num);
  // CLOSED sessions are forgotten entirely; REJECTED ones stay in
  // mds_sessions so the rejection can be observed.  Erasing may drop
  // the last reference to *s, so this must remain the final statement.
  if (s->state == MetaSession::STATE_CLOSED)
    mds_sessions.erase(s->mds_num);
}
2381
2382 static void reinit_mds_features(MetaSession *session,
2383 const MConstRef<MClientSession>& m) {
2384 session->mds_features = std::move(m->supported_features);
2385 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
2386 }
2387
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  // Dispatch a session-control message from an MDS (open/close/renew/
  // stale/recall/flush/force-ro/reject) to the matching MetaSession.
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  // Ignore messages whose connection no longer matches our session.
  auto session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      if (session->state == MetaSession::STATE_OPEN) {
        ldout(cct, 10) << "mds." << from << " already opened, ignore it"
                       << dendl;
        // The MDS could send a client_session(open) message even when
        // the session state is STATE_OPEN. Normally, its fine to
        // ignore this message, but, if the MDS sent this message just
        // after it got upgraded, the MDS feature bits could differ
        // than the one before the upgrade - so, refresh the feature
        // bits the client holds.
        reinit_mds_features(session.get(), m);
        return;
      }
      /*
       * The connection maybe broken and the session in client side
       * has been reinitialized, need to update the seq anyway.
       */
      if (!session->seq && m->get_seq())
        session->seq = m->get_seq();

      reinit_mds_features(session.get(), m);

      renew_caps(session.get());
      session->state = MetaSession::STATE_OPEN;
      if (is_unmounting())
        mount_cond.notify_all();
      else
        connect_mds_targets(from);
      // Wake anyone in make_request() waiting for this session to open.
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session.get());
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the ack that matches our most recent renew request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      // If the session had lapsed, wake cap waiters now that it's live.
      if (was_stale)
        wake_up_session_caps(session.get(), false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // Expire the cap TTL immediately (now - 1) and try to renew.
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session.get());
    break;

  case CEPH_SESSION_RECALL_STATE:
    /*
     * Call the renew caps and flush cap releases just before
     * triming the caps in case the tick() won't get a chance
     * to run them, which could cause the client to be blocklisted
     * and MDS daemons trying to recall the caps again and
     * again.
     *
     * In most cases it will do nothing, and the new cap releases
     * added by trim_caps() followed will be deferred flushing
     * by tick().
     */
    renew_and_flush_cap_releases();
    trim_caps(session.get(), m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NOTE: the inner 'm' shadows the incoming message; it is the
    // queued cap-release message pending on this session.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session.get());
    break;

  case CEPH_SESSION_REJECT:
    {
      // Surface the MDS-supplied reason if one was included.
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
    }
    break;

  default:
    ceph_abort();
  }
}
2502
2503 bool Client::_any_stale_sessions() const
2504 {
2505 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2506
2507 for (const auto &p : mds_sessions) {
2508 if (p.second->state == MetaSession::STATE_STALE) {
2509 return true;
2510 }
2511 }
2512
2513 return false;
2514 }
2515
2516 void Client::_kick_stale_sessions()
2517 {
2518 ldout(cct, 1) << __func__ << dendl;
2519
2520 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2521 auto s = it->second;
2522 if (s->state == MetaSession::STATE_REJECTED) {
2523 mds_sessions.erase(it->first);
2524 continue;
2525 }
2526 if (s->state == MetaSession::STATE_STALE)
2527 _closed_mds_session(s.get());
2528 }
2529 }
2530
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // Build a wire MClientRequest from this MetaRequest and ship it to
  // the session's MDS, recording bookkeeping (sent_stamp, mds,
  // sent_on_mseq) consulted when the reply or a forward arrives.
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request, mds);
  // build_client_request() returns null when it aborted the request
  // (retry-count overflow); nothing to send in that case.
  if (!r)
    return;

  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // We already got an unsafe reply: this is a replay of an op the MDS
    // applied but never committed.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may change the file layout; pin the osdmap epoch so the
    // MDS can validate the referenced pools.
    objecter->with_osdmap([r](const OSDMap& o) {
      r->set_osdmap_epoch(o.get_epoch());
    });
  }

  // Only stamp the first send (mds == -1 means never sent); resends
  // keep the original timestamp so latency is measured end-to-end.
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap migration seq in effect when we sent — presumably
  // used to detect replies that raced with cap migration (TODO confirm
  // against the reply path).
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2582
ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_rank_t mds)
{
  // Translate a MetaRequest into a wire MClientRequest for the given
  // MDS.  Returns nullptr after aborting the request when its retry
  // count would overflow what the target MDS can represent.
  auto session = mds_sessions.at(mds);
  bool old_version = !session->mds_features.test(CEPHFS_FEATURE_32BITS_RETRY_FWD);

  /*
   * Avoid inifinite retrying after overflow.
   *
   * The client will increase the retry count and if the MDS is
   * old version, so we limit to retry at most 256 times.
   */
  if (request->retry_attempt) {
    // Capacity of the legacy num_retry field: sizeof is 1 byte, so
    // 1 << 8 == 256 distinct values.
    int old_max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry);
    old_max_retry = 1 << (old_max_retry * CHAR_BIT);
    // NOTE(review): "(uint32_t)x >= UINT32_MAX" is only true when the
    // cast value equals UINT32_MAX exactly; it guards the 32-bit
    // wrap-around case for new-version MDSes.
    if ((old_version && request->retry_attempt >= old_max_retry) ||
        (uint32_t)request->retry_attempt >= UINT32_MAX) {
      request->abort(-CEPHFS_EMULTIHOP);
      request->caller_cond->notify_all();
      ldout(cct, 1) << __func__ << " request tid " << request->tid
                    << " retry seq overflow" << ", abort it" << dendl;
      return nullptr;
    }
  }

  auto req = make_message<MClientRequest>(request->get_op(), session->mds_features);
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  // Derive the path from the inode, the dentry's inode, or the dentry's
  // parent directory plus the dentry name, in that order of preference.
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or appropriately-endowed dentry given!"
                         << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or dentry given!"
                         << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_alternate_name(request->alternate_name);
  req->set_data(request->data);
  req->fscrypt_auth = request->fscrypt_auth;
  req->fscrypt_file = request->fscrypt_file;
  // Stamp this attempt's retry number, then bump it for the next resend.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.ext_num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2645
2646
2647
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  // An MDS forwarded our request to another rank: validate the fwd
  // count, retarget the pending request, and wake its caller so it
  // resends to the new MDS.
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());

  std::scoped_lock cl(client_lock);
  // Ignore forwards from connections that no longer match our session.
  auto session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  /*
   * Avoid inifinite retrying after overflow.
   *
   * The MDS will increase the fwd count and in client side
   * if the num_fwd is less than the one saved in request
   * that means the MDS is an old version and overflowed of
   * 8 bits.
   */
  auto num_fwd = fwd->get_num_fwd();
  if (num_fwd <= request->num_fwd || (uint32_t)num_fwd >= UINT32_MAX) {
    request->abort(-CEPHFS_EMULTIHOP);
    request->caller_cond->notify_all();
    ldout(cct, 0) << __func__ << " request tid " << tid << " new num_fwd "
      << num_fwd << " old num_fwd " << request->num_fwd << ", fwd seq overflow"
      << ", abort it" << dendl;
    return;
  }

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  // Detach the request from the old session and point it at the new
  // rank; the caller blocked in make_request() notices resend_mds >= 0
  // when woken and resends there.
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = num_fwd;
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->notify_all();
}
2702
2703 bool Client::is_dir_operation(MetaRequest *req)
2704 {
2705 int op = req->get_op();
2706 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2707 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2708 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2709 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2710 return true;
2711 return false;
2712 }
2713
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  // Attach an MDS reply to its pending MetaRequest, hand it off to the
  // caller blocked in make_request(), and clean up once the safe
  // (committed) reply has arrived.
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  // correct sessions ?
  if (request->mds != mds_num) {
    ldout(cct, 0) << "got a stale reply from mds." << mds_num
                  << " instead of mds." << request->mds << dendl;
    return;
  }

  // A second unsafe reply for the same tid is a duplicate; drop it.
  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session.get());

  // Handle unsafe reply: link the request everywhere we may need to
  // wait for the safe commit later (session, parent dir, target inode).
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    // Block (adopting the already-held client_lock for the wait) until
    // the caller has consumed the reply and cleared dispatch_cond.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    // Keep holding client_lock; the scoped_lock above still owns it.
    l.release();
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (is_unmounting())
    mount_cond.notify_all();
}
2807
2808 void Client::_handle_full_flag(int64_t pool)
2809 {
2810 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2811 << "on " << pool << dendl;
2812 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2813 // to do this rather than blocking, because otherwise when we fill up we
2814 // potentially lock caps forever on files with dirty pages, and we need
2815 // to be able to release those caps to the MDS so that it can delete files
2816 // and free up space.
2817 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2818
2819 // For all inodes with layouts in this pool and a pending flush write op
2820 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2821 // from ObjectCacher so that it doesn't re-issue the write in response to
2822 // the ENOSPC error.
2823 // Fortunately since we're cancelling everything in a given pool, we don't
2824 // need to know which ops belong to which ObjectSet, we can just blow all
2825 // the un-flushed cached data away and mark any dirty inodes' async_err
2826 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2827 // affecting this pool, and all the objectsets we're purging were also
2828 // in this pool.
2829 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2830 i != inode_map.end(); ++i)
2831 {
2832 Inode *inode = i->second;
2833 if (inode->oset.dirty_or_tx
2834 && (pool == -1 || inode->layout.pool_id == pool)) {
2835 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2836 << " has dirty objects, purging and setting ENOSPC" << dendl;
2837 objectcacher->purge_set(&inode->oset);
2838 inode->set_async_err(-CEPHFS_ENOSPC);
2839 }
2840 }
2841
2842 if (cancelled_epoch != (epoch_t)-1) {
2843 set_cap_epoch_barrier(cancelled_epoch);
2844 }
2845 }
2846
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  // React to a new OSD map: detect (and recover from) blocklisting of
  // this client, and cancel writes that can no longer succeed because
  // the cluster or individual pools are full.
  std::scoped_lock cl(client_lock);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blocklist = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.is_blocklisted(myaddrs);
    });

  // Transition into the blocklisted state: fail MDS sessions and all
  // pending OSD writes.
  if (new_blocklist && !blocklisted) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
    blocklisted = true;

    _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);

  }

  if (blocklisted) {
    // Handle case where we were blocklisted but no longer are
    blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
      return o.is_blocklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blocklisted client
  // until this client is not blocklisted.
  if (blocklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool (pool == -1).
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
      for (const auto& kv : o.get_pools()) {
        if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
          full_pools.push_back(kv.first);
        }
      }
    });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2913
2914
2915 // ------------------------
2916 // incoming messages
2917
2918
bool Client::ms_dispatch2(const MessageRef &m)
{
  // Messenger entry point: route each incoming message type to its
  // handler.  Returns false for message types this dispatcher does not
  // consume, so the messenger can offer them to other dispatchers.
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied()) {
    // Not (or no longer) initialized: consume and drop everything.
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    // Command replies are only ours when they come from an MDS;
    // otherwise let another dispatcher handle them.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, every dispatched message doubles as a chance to
  // trim the cache; poke unmount() whenever the trim made progress.
  std::scoped_lock cl(client_lock);
  if (is_unmounting()) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    uint64_t size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
3001
3002 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
3003 {
3004 std::scoped_lock cl(client_lock);
3005 fsmap.reset(new FSMap(m->get_fsmap()));
3006
3007 signal_cond_list(waiting_for_fsmap);
3008
3009 monclient->sub_got("fsmap", fsmap->get_epoch());
3010 }
3011
3012 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
3013 {
3014 std::scoped_lock cl(client_lock);
3015 fsmap_user.reset(new FSMapUser);
3016 *fsmap_user = m->get_fsmap();
3017
3018 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
3019 signal_cond_list(waiting_for_fsmap);
3020 }
3021
3022 // Cancel all the commands for missing or laggy GIDs
void Client::cancel_commands(const MDSMap& newmap)
{
  // Collect the tids to cancel first, then erase them afterwards, so
  // command_table is never mutated while being iterated.
  std::vector<ceph_tid_t> cancel_ops;

  std::scoped_lock cmd_lock(command_lock);
  auto &commands = command_table.get_commands();
  for (const auto &[tid, op] : commands) {
    const mds_gid_t op_mds_gid = op.mds_gid;
    // Cancel if the MDS is gone from the new map or marked laggy.
    if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
      cancel_ops.push_back(tid);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      /*
       * No need to make the con->mark_down under
       * client_lock here, because the con will
       * has its own lock.
       */
      op.con->mark_down();
      // Report the cancellation to anyone waiting on the command.
      if (op.on_finish)
        op.on_finish->complete(-CEPHFS_ETIMEDOUT);
    }
  }

  for (const auto &tid : cancel_ops)
    command_table.erase(tid);
}
3053
/*
 * Handle a new MDSMap epoch: decode it, cancel commands aimed at MDSs
 * that disappeared, then walk our sessions and react to each rank's
 * state change (reconnect, replay/active transitions, closing, ...).
 */
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // Drop client_lock while decoding and cancelling commands:
  // cancel_commands() takes command_lock and must not nest inside
  // client_lock.
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After this swap, _mdsmap holds the OLD map and mdsmap the new one.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSessionRef session = p->second;
    // Advance before the body runs: _closed_mds_session() may erase the
    // current entry.
    ++p;

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        // A new incarnation is effectively a brand-new daemon; forget
        // the old state so the transition logic below treats it as such.
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session.get());
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session.get());
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session.get());
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session.get());
          kick_flushing_caps(session.get());
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session.get(), true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session.get());
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
3131
/*
 * Send an MClientReconnect to an MDS in reconnect state, describing
 * every cap and snaprealm we hold from it so the recovering MDS can
 * rebuild its session state.  With the MULTI_RECONNECT feature the
 * payload may be split across several messages.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  // Newer MDSs let us split an oversized reconnect into multiple messages.
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // Flush the current message and start a fresh one before it grows
      // past what a single message can safely carry.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        // stale gen: we can only safely claim the PIN cap
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3225
3226
3227 void Client::kick_requests(MetaSession *session)
3228 {
3229 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3230 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3231 p != mds_requests.end();
3232 ++p) {
3233 MetaRequest *req = p->second;
3234 if (req->got_unsafe)
3235 continue;
3236 if (req->aborted()) {
3237 if (req->caller_cond) {
3238 req->kick = true;
3239 req->caller_cond->notify_all();
3240 }
3241 continue;
3242 }
3243 if (req->retry_attempt > 0)
3244 continue; // new requests only
3245 if (req->mds == session->mds_num) {
3246 send_request(p->second, session);
3247 }
3248 }
3249 }
3250
3251 void Client::resend_unsafe_requests(MetaSession *session)
3252 {
3253 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3254 !iter.end();
3255 ++iter)
3256 send_request(*iter, session);
3257
3258 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3259 // process completed requests in clientreplay stage.
3260 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3261 p != mds_requests.end();
3262 ++p) {
3263 MetaRequest *req = p->second;
3264 if (req->got_unsafe)
3265 continue;
3266 if (req->aborted())
3267 continue;
3268 if (req->retry_attempt == 0)
3269 continue; // old requests only
3270 if (req->mds == session->mds_num)
3271 send_request(req, session, true);
3272 }
3273 }
3274
3275 void Client::wait_unsafe_requests()
3276 {
3277 list<MetaRequest*> last_unsafe_reqs;
3278 for (const auto &p : mds_sessions) {
3279 const auto s = p.second;
3280 if (!s->unsafe_requests.empty()) {
3281 MetaRequest *req = s->unsafe_requests.back();
3282 req->get();
3283 last_unsafe_reqs.push_back(req);
3284 }
3285 }
3286
3287 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3288 p != last_unsafe_reqs.end();
3289 ++p) {
3290 MetaRequest *req = *p;
3291 if (req->unsafe_item.is_on_list())
3292 wait_on_list(req->waitfor_safe);
3293 put_request(req);
3294 }
3295 }
3296
/*
 * A session is being torn down: abort every pending request targeted at
 * its MDS.  Blocked callers are woken (via req->kick); unsafe requests
 * are dropped with EIO recorded on the affected inodes, since we can no
 * longer know whether the MDS durably applied them.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    // Advance before the body: unregister_request() may erase this entry.
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        if (is_dir_operation(req)) {
          Inode *dir = req->inode();
          ceph_assert(dir);
          // Record the loss so a later fsync()/close() on the directory
          // reports it.
          dir->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     << dir->ino << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        if (req->target) {
          InodeRef &in = req->target;
          in->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     << in->ino << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
3336
3337
3338
3339
3340 /************
3341 * leases
3342 */
3343
3344 void Client::got_mds_push(MetaSession *s)
3345 {
3346 s->seq++;
3347 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3348 if (s->state == MetaSession::STATE_CLOSING) {
3349 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3350 }
3351 }
3352
/*
 * Handle a dentry-lease revocation pushed by an MDS: invalidate the
 * local lease (if we still know the inode/dentry) and always send a
 * RELEASE acknowledgment back.
 */
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  // The MDS only ever pushes revocations at clients.
  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    // Invalidate: this dentry can no longer be trusted without a round
    // trip to the MDS.
    dn->lease_mds = -1;
  }

 revoke:
  // Always ack, even when we no longer know the inode or dentry.
  {
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3396
/*
 * Drop @n references from @in.  If afterwards only the inode_map's
 * reference remains, tear the inode down: release caps, evict cached
 * objects, remove it from the maps and drop the final reference.
 */
void Client::_put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  int left = in->get_nref();
  // The caller must hold at least n refs, plus the inode_map's one.
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // All dirty data must have been flushed by now.
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (root == nullptr) {
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    // Drop the final (inode_map) reference; this frees the inode.
    in->iput();
  }
}
3425
3426 void Client::delay_put_inodes(bool wakeup)
3427 {
3428 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3429
3430 std::map<Inode*,int> release;
3431 {
3432 std::scoped_lock dl(delay_i_lock);
3433 release.swap(delay_i_release);
3434 }
3435
3436 if (release.empty())
3437 return;
3438
3439 for (auto &[in, cnt] : release)
3440 _put_inode(in, cnt);
3441
3442 if (wakeup)
3443 mount_cond.notify_all();
3444 }
3445
/*
 * Queue @n reference drops for @in.  The actual _put_inode() happens
 * later in delay_put_inodes() under client_lock; deferring here avoids
 * lock-ordering problems at call sites that cannot take client_lock.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;

  std::scoped_lock dl(delay_i_lock);
  delay_i_release[in] += n;
}
3453
/*
 * Destroy an empty Dir object and unpin the dentry and inode that were
 * keeping it alive.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in); // unpin inode
}
3468
/**
 * Attach a dentry for @name under @dir, optionally linking inode @in.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    if(in) {
      ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << *in
                     << " dn " << *dn << " (new dn)" << dendl;
    } else {
      ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' "
                     << " dn " << *dn << " (new dn)" << dendl;
    }
  } else {
    // re-using a pre-created (negative) dentry
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << *dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << *dn << " (old dn)" << dendl;
  }

  if (in) { // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true); // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3514
/*
 * Detach dentry @dn from its inode and, unless @keepdentry, from its
 * directory too.  @keepdir prevents closing the Dir even when this was
 * its last entry.
 */
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Hold a ref so the inode survives until we're done logging/cleanup.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // Keep the (now negative) dentry, but its lease is meaningless.
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3545
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // pins the inode until the flush completes
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami; // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // Record the error so a later fsync()/close() can report it.
      inode->set_async_err(r);
    }
  }
};
3567
3568
3569 /****
3570 * caps
3571 */
3572
3573 void Client::get_cap_ref(Inode *in, int cap)
3574 {
3575 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3576 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3577 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3578 in->iget();
3579 }
3580 if ((cap & CEPH_CAP_FILE_CACHE) &&
3581 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3582 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3583 in->iget();
3584 }
3585 in->get_cap_ref(cap);
3586 }
3587
/*
 * Drop cap references taken via get_cap_ref().  When the last reference
 * of a kind is dropped this may finish a pending cap_snap, flush snap
 * data, release the extra inode pins held for BUFFER/CACHE, and ask the
 * MDS to adjust our caps via check_caps().
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // Caps whose last ref just dropped and which are no longer issued.
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // All buffered data has been written out; cap_snaps no longer
        // have dirty buffered data to wait for.
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;

        if (!in->cap_snaps.empty()) {
          flush_snaps(in);
        }
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3625
// get caps for a given file handle -- the inode should have @need caps
// issued by the mds and @want caps not revoked (or not under revocation).
// this routine blocks till the cap requirement is satisfied. also account
// (track) for capability hit when required (when cap requirement succeedes).
//
// Returns 0 with *phave set on success, or a negative CEPHFS_* error.
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  // Loop until we can take the needed refs or hit a hard error; each
  // iteration may block on waitfor_caps/waitfor_commit.
  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // A stale generation means the fd was invalidated (e.g. by remount).
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // Ask the MDS for a larger max_size before we need it.
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // Success: nothing we want is currently being revoked.
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) &&
        ((in->auth_cap && in->auth_cap->session->readonly) ||
         // userland clients are only allowed to read if fscrypt enabled
         in->is_fscrypt_enabled()))
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        // The MDS forgot what we want; re-request and retry.
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3736
3737 int Client::get_caps_used(Inode *in)
3738 {
3739 unsigned used = in->caps_used();
3740 if (!(used & CEPH_CAP_FILE_CACHE) &&
3741 !objectcacher->set_is_empty(&in->oset))
3742 used |= CEPH_CAP_FILE_CACHE;
3743 return used;
3744 }
3745
3746 void Client::cap_delay_requeue(Inode *in)
3747 {
3748 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3749
3750 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
3751 delayed_list.push_back(&in->delay_cap_item);
3752 }
3753
/*
 * Build and send an MClientCaps UPDATE for one cap.
 *
 * @flags     MClientCaps::FLAG_* bits to set on the message
 * @used      caps currently in active use
 * @want      caps we would like to keep
 * @retain    caps we are willing to keep holding (the rest are released)
 * @flush     dirty caps being flushed, if any
 * @flush_tid tid identifying this flush for later ack matching
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  // Never retain something that is being revoked.
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  // Attribute the flush to whoever dirtied the caps, not to us.
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  m->fscrypt_auth = in->fscrypt_auth;
  m->fscrypt_file = in->fscrypt_file;

  // Tell the MDS a capsnap is still pending flush for this inode.
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3873
3874 static bool is_max_size_approaching(Inode *in)
3875 {
3876 /* mds will adjust max size according to the reported size */
3877 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3878 return false;
3879 if (in->size >= in->max_size)
3880 return true;
3881 /* half of previous max_size increment has been used */
3882 if (in->max_size > in->reported_size &&
3883 (in->size << 1) >= in->max_size + in->reported_size)
3884 return true;
3885 return false;
3886 }
3887
3888 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3889 {
3890 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3891 return used;
3892 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3893 return used;
3894
3895 if (issued & CEPH_CAP_FILE_LAZYIO) {
3896 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3897 used &= ~CEPH_CAP_FILE_CACHE;
3898 used |= CEPH_CAP_FILE_LAZYIO;
3899 }
3900 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3901 used &= ~CEPH_CAP_FILE_BUFFER;
3902 used |= CEPH_CAP_FILE_LAZYIO;
3903 }
3904 } else {
3905 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3906 used &= ~CEPH_CAP_FILE_CACHE;
3907 used |= CEPH_CAP_FILE_LAZYIO;
3908 }
3909 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3910 used &= ~CEPH_CAP_FILE_BUFFER;
3911 used |= CEPH_CAP_FILE_LAZYIO;
3912 }
3913 }
3914 return used;
3915 }
3916
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_* bits)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Work out which caps we'd like to keep holding.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // If a revoked Fc/LAZYIO is only pinned by clean cached data, release
  // the cached objects now so the revocation can be acknowledged.
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  for (auto &[mds, cap] : in->caps) {
    auto session = mds_sessions.at(mds);

    cap_used = used;
    // Caps covered by the auth MDS don't count as "used" toward replicas.
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap.issued)
             << " implemented " << ccap_string(cap.implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // Need a bigger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    // Batch the release unless the caller demanded an immediate check.
    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session.get());
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    in->delay_cap_item.remove_myself();
    send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
4061
4062
// Capture a snapshot of this inode's state before the snap context changes.
// If there are dirty caps or the file is open for write, record a CapSnap
// keyed by the old snap context's seq, to be flushed to the MDS later via
// flush_snaps(). No-op if a pending (still-writing) cap_snap already exists.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  // only one in-progress (writing) cap_snap at a time; the newest is at rbegin()
  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (dirty || (used & CEPH_CAP_FILE_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = dirty;

    // remember whether buffered dirty data must be flushed for this snap
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata that belongs to the old snap context
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // still open for write: size/times may keep changing, finalize later
      // in finish_cap_snap()
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
4102
// Finalize a pending CapSnap once writes have stopped: record the final
// size/time/change attrs, then either trigger a flush of dirty buffered
// data (completion re-enters via C_Client_FlushComplete) or flush the
// snap caps immediately.
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  // pick up anything dirtied since queue_cap_snap()
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, trigger to flush dirty buffer" << dendl;

    /* trigger to flush the buffer */
    _flush(in, new C_Client_FlushComplete(this, in));
  } else {
    // no buffered data: the capsnap can be flushed to the MDS right away
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
4136
4137 void Client::send_flush_snap(Inode *in, MetaSession *session,
4138 snapid_t follows, CapSnap& capsnap)
4139 {
4140 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4141 in->ino, in->snaprealm->ino, 0,
4142 in->auth_cap->mseq, cap_epoch_barrier);
4143 m->caller_uid = capsnap.cap_dirtier_uid;
4144 m->caller_gid = capsnap.cap_dirtier_gid;
4145
4146 m->set_client_tid(capsnap.flush_tid);
4147 m->head.snap_follows = follows;
4148
4149 m->head.caps = capsnap.issued;
4150 m->head.dirty = capsnap.dirty;
4151
4152 m->head.uid = capsnap.uid;
4153 m->head.gid = capsnap.gid;
4154 m->head.mode = capsnap.mode;
4155 m->btime = capsnap.btime;
4156
4157 m->size = capsnap.size;
4158
4159 m->head.xattr_version = capsnap.xattr_version;
4160 encode(capsnap.xattrs, m->xattrbl);
4161
4162 m->ctime = capsnap.ctime;
4163 m->btime = capsnap.btime;
4164 m->mtime = capsnap.mtime;
4165 m->atime = capsnap.atime;
4166 m->time_warp_seq = capsnap.time_warp_seq;
4167 m->change_attr = capsnap.change_attr;
4168
4169 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4170 m->inline_version = in->inline_version;
4171 m->inline_data = in->inline_data;
4172 }
4173
4174 ceph_assert(!session->flushing_caps_tids.empty());
4175 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4176
4177 session->con->send_message2(std::move(m));
4178 }
4179
// Flush all not-yet-flushed cap_snaps on this inode to the auth MDS, in
// snap order. Stops at the first capsnap that is still accumulating data.
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // snaps must be flushed in order; stop at the first one that is still
    // writing or has unflushed buffered data
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // allocate a flush tid and track it on both the session and the inode
    // (value 0 in flushing_cap_tids marks a capsnap flush, see
    // kick_flushing_caps)
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
4214
// Park the calling thread on a private condition variable registered in
// `ls` until signal_cond_list() wakes it. client_lock must be held on
// entry; the adopt_lock/release() pair lets the condvar release and
// reacquire client_lock without this function taking over its ownership.
void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();  // leave client_lock held on return
  ls.remove(&cond);
}
4224
4225 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4226 {
4227 for (auto cond : ls) {
4228 cond->notify_all();
4229 }
4230 }
4231
// Like wait_on_list(), but registers a C_Cond Context in `ls`; the waiter
// wakes when another path complete()s that context (e.g. via
// signal_context_list()). client_lock must be held on entry and is still
// held on return.
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // completion result written by C_Cond; not examined here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();  // keep client_lock held on return
}
4242
4243 void Client::signal_context_list(list<Context*>& ls)
4244 {
4245 while (!ls.empty()) {
4246 ls.front()->complete(0);
4247 ls.pop_front();
4248 }
4249 }
4250
// Wake cap waiters for every inode with a cap on session s. On reconnect,
// the per-inode max_size negotiation state is reset; otherwise caps whose
// gen is stale (not re-issued by the MDS) are downgraded to PIN and the
// inode flagged so the MDS is re-told what we want.
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}
4270
4271
4272 // flush dirty data (from objectcache)
4273
// Deferred cache-invalidate upcall. Snapshots the inode's vino at
// construction time (the Inode* may be gone by the time the finisher
// runs) and invokes Client::_async_invalidate() outside client_lock.
class C_Client_CacheInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t ino;        // captured vino (faked ino if enabled)
  int64_t offset, length; // byte range to invalidate; (0,0) = everything
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
4293
// Finisher-thread half of cache invalidation: forwards the range to the
// registered ino_invalidate_cb unless the client has left the
// mounting/mounted state.
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
4303
4304 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4305
4306 if (ino_invalidate_cb)
4307 // we queue the invalidate, which calls the callback and decrements the ref
4308 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4309 }
4310
// Drop the entire userspace object cache for an inode and schedule the
// kernel page-cache invalidation upcall ((0,0) = whole file).
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
4324
// Range variant: discard cached extents (including in-flight writeback)
// covering [off, off+len) and schedule the matching kernel invalidation.
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto object extents per the inode's striping layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
4338
4339 bool Client::_release(Inode *in)
4340 {
4341 ldout(cct, 20) << "_release " << *in << dendl;
4342 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4343 _invalidate_inode_cache(in);
4344 return true;
4345 }
4346 return false;
4347 }
4348
// Start flushing an inode's dirty buffered data. Completes `onfinish`
// immediately (and returns true) when there is nothing to flush, or when
// the pool is full — in that case the dirty data is purged and onfinish
// gets -CEPHFS_ENOSPC. Otherwise returns objectcacher->flush_set()'s
// result, with onfinish called when the flush finishes.
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // full pool: writeback would hang forever, drop the data instead
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-CEPHFS_ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
4370
// Synchronously flush buffered writes in [offset, offset+size).
// client_lock must be held; it is dropped and re-taken while waiting for
// the objectcacher flush to complete.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  // file_flush() returns true when the range was already clean/flushed
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
4389
4390 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4391 {
4392 // std::scoped_lock l(client_lock);
4393 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4394 Inode *in = static_cast<Inode *>(oset->parent);
4395 ceph_assert(in);
4396 _flushed(in);
4397 }
4398
// Buffer flush finished: drop the cap references that pinned the cache
// while dirty data was outstanding.
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4405
4406
4407
4408 // checks common to add_update_cap, handle_cap_grant
4409 void Client::check_cap_issue(Inode *in, unsigned issued)
4410 {
4411 unsigned had = in->caps_issued();
4412
4413 if ((issued & CEPH_CAP_FILE_CACHE) &&
4414 !(had & CEPH_CAP_FILE_CACHE))
4415 in->cache_gen++;
4416
4417 if ((issued & CEPH_CAP_FILE_SHARED) !=
4418 (had & CEPH_CAP_FILE_SHARED)) {
4419 if (issued & CEPH_CAP_FILE_SHARED)
4420 in->shared_gen++;
4421 if (in->is_dir())
4422 clear_dir_complete_and_ordered(in, true);
4423 }
4424 }
4425
// Insert or refresh the cap for inode `in` held via `mds_session`.
// Handles first-cap snaprealm attachment, realm moves signalled by the
// auth MDS, stale-generation resets, auth-cap migration bookkeeping, and
// finally updates the cap's issued/wanted/seq fields. Wakes cap waiters
// if new bits were granted.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap on this inode: attach it to its snap realm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      // the auth MDS says the inode now belongs to a different realm; move it
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // cap for this MDS already existed
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // a brand-new cap pins the inode
    inc_pinned_icaps();
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as the auth cap if it is newer (by mseq) than the
    // current one, migrating flushing/dirty-list membership to the new
    // auth session
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap) {
        if (in->flushing_cap_item.is_on_list()) {
          ldout(cct, 10) << __func__ << " changing auth cap: "
                         << "add myself to new auth MDS' flushing caps list" << dendl;
          adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
        }
        if (in->dirty_cap_item.is_on_list()) {
          ldout(cct, 10) << __func__ << " changing auth cap: "
                         << "add myself to new auth MDS' dirty caps list" << dendl;
          mds_session->get_dirty_list().push_back(&in->dirty_cap_item);
        }
      }

      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // a newer mseq replaces wanted outright; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4532
// Tear down one cap. If queue_release, a cap-release record is queued on
// the session for the MDS; otherwise the pinned-icap counter is dropped.
// Clears auth_cap/flushing-list state when this was the auth cap, and
// closes the snaprealm linkage when the inode's last cap goes away.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  } else {
    dec_pinned_icaps();
  }


  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys *cap; null the local pointer so it
  // cannot be used after free
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4571
4572 void Client::remove_all_caps(Inode *in)
4573 {
4574 while (!in->caps.empty())
4575 remove_cap(&in->caps.begin()->second, true);
4576 }
4577
// Drop every cap held via session s (session reset/close/blocklist).
// Dirty and flushing state is abandoned (err describes why), file locks
// are flagged as errored, and cached data is purged or released depending
// on whether we were blocklisted.
void Client::remove_session_caps(MetaSession *s, int err)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);  // keep the inode alive during teardown
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      if (in->has_any_filelocks())
        in->flags |= I_ERROR_FILELOCK;
    }
    auto caps = cap->implemented;
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      // data/metadata that was never flushed is lost; reset the flushing
      // bookkeeping and drop the ref that dirty state held on the inode
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
    if (caps && !in->caps_issued_mask(caps, true)) {
      if (err == -CEPHFS_EBLOCKLISTED) {
        // blocklisted: dirty data can never be written back, purge it and
        // surface the error asynchronously
        if (in->oset.dirty_or_tx) {
          lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
          in->set_async_err(err);
        }
        objectcacher->purge_set(&in->oset);
      } else {
        objectcacher->release_set(&in->oset);
      }
      _schedule_invalidate_callback(in.get(), 0, 0);
    }

    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}
4627
4628 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4629 {
4630 uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure");
4631 bool abort_on_failure = false;
4632
4633 errno = 0;
4634 int r = remount_cb(callback_handle);
4635 if (r == 0) {
4636 retries_on_invalidate = 0;
4637 } else {
4638 int e = errno;
4639 client_t whoami = get_nodeid();
4640 if (r == -1) {
4641 lderr(cct) <<
4642 "failed to remount (to trim kernel dentries): "
4643 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4644 } else {
4645 lderr(cct) <<
4646 "failed to remount (to trim kernel dentries): "
4647 "return code = " << r << dendl;
4648 }
4649 bool should_abort =
4650 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4651 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4652 !(retry_on_error && (++retries_on_invalidate < max_retries));
4653 if (should_abort && !is_unmounting()) {
4654 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4655 abort_on_failure = true;
4656 }
4657 }
4658 return std::make_pair(r, abort_on_failure);
4659 }
4660
// Finisher context that performs the remount upcall off the client_lock;
// aborts the process when _do_remount() reports that repeated failures
// are fatal.
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    auto result = client->_do_remount(true);
    if (result.second) {
      ceph_abort();
    }
  }
};
4674
4675 void Client::_invalidate_kernel_dcache()
4676 {
4677 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4678 if (!mref_reader.is_state_satisfied())
4679 return;
4680
4681 if (can_invalidate_dentries) {
4682 if (dentry_invalidate_cb && root->dir) {
4683 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4684 p != root->dir->dentries.end();
4685 ++p) {
4686 if (p->second->inode)
4687 _schedule_invalidate_dentry_callback(p->second, false);
4688 }
4689 }
4690 } else if (remount_cb) {
4691 // Hacky:
4692 // when remounting a file system, linux kernel trims all unused dentries in the fs
4693 remount_finisher.queue(new C_Client_Remount(this));
4694 }
4695 }
4696
// If every dentry in the directory is a null (negative) dentry, unlink all
// expireable ones and close the dir once empty, so the inode's caps can be
// released. Recurses into an open ".snap" dir when one exists.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink(): the current entry may be erased
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4721
// Deferred inode-release upcall: captures the vino at construction time
// and calls Client::_async_inode_release() from the finisher thread,
// outside client_lock.
class C_Client_CacheRelease : public Context  {
private:
  Client *client;
  vinodeno_t ino;  // captured vino (faked ino if enabled)
public:
  C_Client_CacheRelease(Client *c, Inode *in) :
    client(c) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_inode_release(ino);
  }
};
4739
// Finisher-thread half of inode release: forwards to the registered
// ino_release_cb unless the client has left the mounting/mounted state.
void Client::_async_inode_release(vinodeno_t ino)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << dendl;
  ino_release_cb(callback_handle, ino);
}
4749
4750 void Client::_schedule_ino_release_callback(Inode *in) {
4751
4752 if (ino_release_cb)
4753 // we queue the invalidate, which calls the callback and decrements the ref
4754 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4755 }
4756
// The MDS asked us to hold at most `max` caps on session s. Drop
// disposable non-auth caps outright, and trim expireable dentries so the
// remaining inodes' caps can be released; finally poke the kernel dcache
// if we are still over budget.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
                 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);  // pin the inode while we work on it

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        // comma-expression: remove the cap, then null the dangling pointer
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;  // stays true only if every dentry was expireable
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
        Dentry *dn = *q;
        ++q;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
        _schedule_ino_release_callback(in.get());
      }
      if (all && in->ino != CEPH_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
4827
4828 void Client::force_session_readonly(MetaSession *s)
4829 {
4830 s->readonly = true;
4831 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4832 auto &in = (*p)->inode;
4833 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4834 signal_cond_list(in.waitfor_caps);
4835 }
4836 }
4837
// Move the inode's dirty caps into "flushing" state: allocate a flush
// tid, record it on both the inode and the auth session, and return the
// cap bits being flushed (the tid is written to *ptid). The caller sends
// the actual flush message.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    // first outstanding flush for this inode
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4865
4866 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4867 {
4868 for (auto &p : in->cap_snaps) {
4869 CapSnap &capsnap = p.second;
4870 if (capsnap.flush_tid > 0) {
4871 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4872 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4873 }
4874 }
4875 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4876 it != in->flushing_cap_tids.end();
4877 ++it) {
4878 old_s->flushing_caps_tids.erase(it->first);
4879 new_s->flushing_caps_tids.insert(it->first);
4880 }
4881 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4882 }
4883
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance before check_caps(), which may unlink the inode from
      // dirty_list and invalidate the current position; the look-ahead
      // also tells us whether this is the session's last dirty inode
      ++p;
      if (p.end())
        flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4906
// Block until every cap flush on `in` with tid <= want has been acked by
// the MDS. flushing_cap_tids is an ordered map, so its first entry is the
// oldest outstanding flush.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything up to `want` has been flushed
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4920
// Block until every session's outstanding cap flushes up to tid `want`
// have completed. Restarts the scan from the top after each wakeup, since
// sessions and their tid sets may change while sleeping on sync_cond.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    auto s = p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      // client_lock is already held; adopt it so the condvar can release
      // and reacquire it, then relinquish ownership bookkeeping again
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4941
// Re-send this inode's in-flight cap flushes and capsnap flushes to the
// auth MDS (e.g. after reconnect). Entries in flushing_cap_tids with a
// zero value are capsnap flushes (see flush_snaps()); cap flushes older
// than the newest capsnap flush are tagged PENDING_CAPSNAP so the MDS
// preserves ordering.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest pending capsnap flush tid (value 0 == capsnap flush)
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // ordinary cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // capsnap flush; cap_snaps and the zero-valued tids stay in lockstep
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4975
// Re-send flushes for every inode on this session that was marked
// I_KICK_FLUSH (set by early_kick_flushing_caps()).
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4989
// Called during reconnect, before the reconnect message is sent. For each
// inode with in-flight flushes on this session: if none of the flushing
// caps were revoked, just mark the inode I_KICK_FLUSH (the flush is
// re-sent later by kick_flushing_caps(session)); otherwise re-send the
// flush now so the MDS processes it before issuing those caps elsewhere.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
5017
5018 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
5019 {
5020 list<SnapRealm*> q;
5021 q.push_back(realm);
5022
5023 while (!q.empty()) {
5024 realm = q.front();
5025 q.pop_front();
5026
5027 ldout(cct, 10) << __func__ << " " << *realm << dendl;
5028 realm->invalidate_cache();
5029
5030 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5031 p != realm->pchildren.end();
5032 ++p)
5033 q.push_back(*p);
5034 }
5035 }
5036
// Look up — or create — the snap realm for ino r and take a reference.
// The global snaprealm gets one extra ref at creation so it is not
// released until unmount.
SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  SnapRealm *realm = snap_realms[r];  // operator[] default-inserts nullptr if absent

  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
                 << (realm ? realm->nref : 0) << dendl;
  if (!realm) {
    snap_realms[r] = realm = new SnapRealm(r);

    // Do not release the global snaprealm until unmounting.
    if (r == CEPH_INO_GLOBAL_SNAPREALM)
      realm->nref++;
  }

  realm->nref++;
  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
                 << realm->nref << dendl;
  return realm;
}
5056
5057 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
5058 {
5059 if (snap_realms.count(r) == 0) {
5060 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
5061 return NULL;
5062 }
5063 SnapRealm *realm = snap_realms[r];
5064 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
5065 realm->nref++;
5066 return realm;
5067 }
5068
5069 void Client::put_snap_realm(SnapRealm *realm)
5070 {
5071 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
5072 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
5073 if (--realm->nref == 0) {
5074 snap_realms.erase(realm->ino);
5075 if (realm->pparent) {
5076 realm->pparent->pchildren.erase(realm);
5077 put_snap_realm(realm->pparent);
5078 }
5079 delete realm;
5080 }
5081 }
5082
5083 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
5084 {
5085 if (realm->parent != parent) {
5086 ldout(cct, 10) << __func__ << " " << *realm
5087 << " " << realm->parent << " -> " << parent << dendl;
5088 realm->parent = parent;
5089 if (realm->pparent) {
5090 realm->pparent->pchildren.erase(realm);
5091 put_snap_realm(realm->pparent);
5092 }
5093 realm->pparent = get_snap_realm(parent);
5094 realm->pparent->pchildren.insert(realm);
5095 return true;
5096 }
5097 return false;
5098 }
5099
5100 static bool has_new_snaps(const SnapContext& old_snapc,
5101 const SnapContext& new_snapc)
5102 {
5103 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
5104 }
5105
// Extra snap-realm metadata carried by SnapRealmInfoNew (only sent by
// MDSs with CEPHFS_FEATURE_NEW_SNAPREALM_INFO); absent on older MDSs,
// hence wrapped in std::optional by get_snap_realm_info().
struct SnapRealmInfoMeta {
  SnapRealmInfoMeta(utime_t last_modified, uint64_t change_attr)
    : last_modified(last_modified),
      change_attr(change_attr) {
  }

  utime_t last_modified;  // realm's last modification time
  uint64_t change_attr;   // realm's change counter
};
5115
5116 static std::pair<SnapRealmInfo, std::optional<SnapRealmInfoMeta>> get_snap_realm_info(
5117 MetaSession *session, bufferlist::const_iterator &p) {
5118 if (session->mds_features.test(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
5119 SnapRealmInfoNew ninfo;
5120 decode(ninfo, p);
5121 return std::make_pair(ninfo.info, SnapRealmInfoMeta(ninfo.last_modified, ninfo.change_attr));
5122 } else {
5123 SnapRealmInfo info;
5124 decode(info, p);
5125 return std::make_pair(info, std::nullopt);
5126 }
5127 }
5128
5129
5130 void Client::update_snap_trace(MetaSession *session, const bufferlist& bl, SnapRealm **realm_ret, bool flush)
5131 {
5132 SnapRealm *first_realm = NULL;
5133 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
5134
5135 map<SnapRealm*, SnapContext> dirty_realms;
5136
5137 auto p = bl.cbegin();
5138 while (!p.end()) {
5139 auto [info, realm_info_meta] = get_snap_realm_info(session, p);
5140 SnapRealm *realm = get_snap_realm(info.ino());
5141
5142 bool invalidate = false;
5143
5144 if (info.seq() > realm->seq ||
5145 (realm_info_meta && (*realm_info_meta).change_attr > realm->change_attr)) {
5146 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
5147 << dendl;
5148
5149 if (flush) {
5150 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
5151 // flush me + children
5152 list<SnapRealm*> q;
5153 q.push_back(realm);
5154 while (!q.empty()) {
5155 SnapRealm *realm = q.front();
5156 q.pop_front();
5157
5158 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5159 p != realm->pchildren.end();
5160 ++p)
5161 q.push_back(*p);
5162
5163 if (dirty_realms.count(realm) == 0) {
5164 realm->nref++;
5165 dirty_realms[realm] = realm->get_snap_context();
5166 }
5167 }
5168 }
5169
5170 // update
5171 realm->seq = info.seq();
5172 realm->created = info.created();
5173 realm->parent_since = info.parent_since();
5174 realm->prior_parent_snaps = info.prior_parent_snaps;
5175 if (realm_info_meta) {
5176 realm->last_modified = (*realm_info_meta).last_modified;
5177 realm->change_attr = (*realm_info_meta).change_attr;
5178 }
5179 realm->my_snaps = info.my_snaps;
5180 invalidate = true;
5181 }
5182
5183 // _always_ verify parent
5184 if (adjust_realm_parent(realm, info.parent()))
5185 invalidate = true;
5186
5187 if (invalidate) {
5188 invalidate_snaprealm_and_children(realm);
5189 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
5190 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5191 } else {
5192 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
5193 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5194 }
5195
5196 if (!first_realm)
5197 first_realm = realm;
5198 else
5199 put_snap_realm(realm);
5200 }
5201
5202 for (auto &[realm, snapc] : dirty_realms) {
5203 // if there are new snaps ?
5204 if (has_new_snaps(snapc, realm->get_snap_context())) {
5205 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
5206 for (auto&& in : realm->inodes_with_caps) {
5207 queue_cap_snap(in, snapc);
5208 }
5209 } else {
5210 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5211 }
5212 put_snap_realm(realm);
5213 }
5214
5215 if (realm_ret)
5216 *realm_ret = first_realm;
5217 else
5218 put_snap_realm(first_realm);
5219 }
5220
// Handle an MClientSnap notification from an MDS. For a SPLIT op, the
// inodes listed in split_inos are migrated out of their current realm
// into the newly announced realm (unless their realm is newer), and the
// child realms in split_realms are re-parented under it. The embedded
// snap trace is then applied via update_snap_trace().
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  // inodes to migrate into the split realm, keyed with their pre-split
  // snap context (used below to decide whether cap snaps must be queued)
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    auto p = m->bl.cbegin();
    auto [info, _] = get_snap_realm_info(session.get(), p);
    // the first realm in the trace must be the one being split off
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // never pull an inode out of a realm created after the split realm
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; skip flushing dirty caps for DESTROY
  update_snap_trace(session.get(), m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split realm, queueing snap
    // writeback where the move exposed new snapshots
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
5292
5293 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5294 {
5295 mds_rank_t mds = mds_rank_t(m->get_source().num());
5296
5297 std::scoped_lock cl(client_lock);
5298 auto session = _get_mds_session(mds, m->get_connection().get());
5299 if (!session) {
5300 return;
5301 }
5302
5303 got_mds_push(session.get());
5304
5305 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5306
5307 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5308 if (inode_map.count(vino)) {
5309 Inode *in = NULL;
5310 in = inode_map[vino];
5311
5312 if (in) {
5313 in->quota = m->quota;
5314 in->rstat = m->rstat;
5315 }
5316 }
5317 }
5318
// Entry point for all MClientCaps messages from an MDS. Applies any OSD
// epoch barrier, then dispatches on the cap operation. When the message
// refers to an inode (or a per-MDS cap) we no longer hold, an immediate
// cap release is sent back instead so an MDS waiting on it can proceed.
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session.get());

  bool do_cap_release = false;
  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;

    /* MDS maybe waiting for cap release with increased seq */
    switch (m->get_op()) {
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT:
        if (!in->caps.count(mds)) {
          do_cap_release = true;
          ldout(cct, 5) << __func__ << " vino " << vino << " don't have cap "
                        << m->get_cap_id() << " op " << m->get_op()
                        << ", immediately releasing" << dendl;
        }
    }
  } else {
    /* MDS maybe waiting for cap release with increased seq */
    switch (m->get_op()) {
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT:
        do_cap_release = true;
        ldout(cct, 5) << __func__ << " don't have vino " << vino << " op "
                      << m->get_op() << ", immediately releasing" << dendl;
        break;
      default:
        // nothing the MDS could be blocked on; just drop the message
        ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
        return;
    }
  }

  // In case the mds is waiting on e.g. a revocation
  if (do_cap_release) {
    session->enqueue_cap_release(
      m->get_ino(),
      m->get_cap_id(),
      m->get_seq(),
      m->get_mseq(),
      cap_epoch_barrier);

    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
    // IMPORT deliberately falls through: after the cap is installed the
    // grant half is handled via the caps.find() dispatch below
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
5408
// Handle CEPH_CAP_OP_IMPORT: this MDS has imported the inode's caps
// (e.g. after subtree migration from m->peer.mds). Apply the bundled
// snap trace, install/update the cap as the auth cap for this session,
// then drop the now-stale cap still held from the exporting peer.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  // remember the cap we hold from the exporting MDS (if any) so it can
  // be removed once the imported cap is installed
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(session, m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
                 issued, wanted, m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exported cap, only if it still matches the peer's cap id
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    // the importing MDS doesn't know about our earlier max_size request;
    // clear it so check_caps() re-requests when needed
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
        in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5451
// Handle CEPH_CAP_OP_EXPORT: mds is giving up this inode's caps. If the
// message names a peer (m->peer.cap_id != 0), transfer/merge our cap
// state onto the peer's session; otherwise just record that caps were
// dropped. The exporting MDS's cap is removed in all cases.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // only act if the export refers to the cap we actually hold
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        const auto peer_mds = mds_rank_t(m->peer.mds);
        auto tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          // we already hold a cap from the target MDS: fold the exported
          // bits into it, but only if the peer's view is newer (seq-wise)
          Cap &tcap = it->second;
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            // in-flight flushes must follow the auth cap to the new session
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession.get());
          }
        } else {
          // no cap from the target yet: create one carrying the exported bits
          add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the caps simply go away; remember that we lost them
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5496
5497 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5498 {
5499 mds_rank_t mds = session->mds_num;
5500 ceph_assert(in->caps.count(mds));
5501
5502 uint64_t size = m->get_size();
5503 if (in->is_fscrypt_enabled()) {
5504 size = std::stoll(std::string(std::rbegin(m->fscrypt_file),
5505 std::rend(m->fscrypt_file)));
5506 }
5507 ldout(cct, 10) << __func__ << " on ino " << *in
5508 << " size " << in->size << " -> " << m->get_size()
5509 << dendl;
5510
5511 int issued;
5512 in->caps_issued(&issued);
5513 issued |= in->caps_dirty();
5514 update_inode_file_size(in, issued, size, m->get_truncate_seq(),
5515 m->get_truncate_size());
5516 }
5517
5518 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5519 {
5520 ceph_tid_t flush_ack_tid = m->get_client_tid();
5521 int dirty = m->get_dirty();
5522 int cleaned = 0;
5523 int flushed = 0;
5524
5525 auto it = in->flushing_cap_tids.begin();
5526 if (it->first < flush_ack_tid) {
5527 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5528 << " got unexpected flush ack tid " << flush_ack_tid
5529 << " expected is " << it->first << dendl;
5530 }
5531 for (; it != in->flushing_cap_tids.end(); ) {
5532 if (!it->second) {
5533 // cap snap
5534 ++it;
5535 continue;
5536 }
5537 if (it->first == flush_ack_tid)
5538 cleaned = it->second;
5539 if (it->first <= flush_ack_tid) {
5540 session->flushing_caps_tids.erase(it->first);
5541 in->flushing_cap_tids.erase(it++);
5542 ++flushed;
5543 continue;
5544 }
5545 cleaned &= ~it->second;
5546 if (!cleaned)
5547 break;
5548 ++it;
5549 }
5550
5551 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5552 << " cleaned " << ccap_string(cleaned) << " on " << *in
5553 << " with " << ccap_string(dirty) << dendl;
5554
5555 if (flushed) {
5556 signal_cond_list(in->waitfor_caps);
5557 if (session->flushing_caps_tids.empty() ||
5558 *session->flushing_caps_tids.begin() > flush_ack_tid)
5559 sync_cond.notify_all();
5560 }
5561
5562 if (!dirty) {
5563 in->cap_dirtier_uid = -1;
5564 in->cap_dirtier_gid = -1;
5565 }
5566
5567 if (!cleaned) {
5568 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5569 } else {
5570 if (in->flushing_caps) {
5571 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5572 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5573 in->flushing_caps &= ~cleaned;
5574 if (in->flushing_caps == 0) {
5575 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5576 num_flushing_caps--;
5577 if (in->flushing_cap_tids.empty())
5578 in->flushing_cap_item.remove_myself();
5579 }
5580 if (!in->caps_dirty())
5581 put_inode(in);
5582 }
5583 }
5584 }
5585
5586
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS has persisted the cap snap
// identified by `follows`. Drop the matching CapSnap (when the flush
// tid matches) and wake waiters; a mismatched or unknown tid is treated
// as a duplicate ack and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // hold a ref: erasing the cap snap may drop the inode's last ref
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
                    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
        in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake fsync()-style waiters once nothing older remains in flight
      if (session->flushing_caps_tids.empty() ||
          *session->flushing_caps_tids.begin() > flush_ack_tid)
        sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
                  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5619
5620 class C_Client_DentryInvalidate : public Context {
5621 private:
5622 Client *client;
5623 vinodeno_t dirino;
5624 vinodeno_t ino;
5625 string name;
5626 public:
5627 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5628 client(c), name(dn->name) {
5629 if (client->use_faked_inos()) {
5630 dirino.ino = dn->dir->parent_inode->faked_ino;
5631 if (del)
5632 ino.ino = dn->inode->faked_ino;
5633 } else {
5634 dirino = dn->dir->parent_inode->vino();
5635 if (del)
5636 ino = dn->inode->vino();
5637 }
5638 if (!del)
5639 ino.ino = inodeno_t();
5640 }
5641 void finish(int r) override {
5642 // _async_dentry_invalidate is responsible for its own locking
5643 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5644 client->_async_dentry_invalidate(dirino, ino, name);
5645 }
5646 };
5647
5648 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5649 {
5650 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5651 if (!mref_reader.is_state_satisfied())
5652 return;
5653
5654 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5655 << " in dir " << dirino << dendl;
5656 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5657 }
5658
5659 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5660 {
5661 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5662 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5663 }
5664
// Try to drop cached references pinning `in` so it can be trimmed:
// expire child dentries (for directories), recurse into an open snapdir,
// and finally unlink the inode's own dentries, optionally scheduling
// kernel dcache invalidation (sched_inval).
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance first: unlink() below may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released one reference on `in`
    }
  }

  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    // an open ".snap" dir also pins us; trim it too
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    // still referenced: drop the inode's own dentries
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance first: unlink() below erases the current dentry
      if( in->ll_ref > 0 && sched_inval) {
        // FIXME: we play lots of unlink/link tricks when handling MDS replies,
        // so in->dentries doesn't always reflect the state of kernel's dcache.
        _schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5711
// Handle CEPH_CAP_OP_GRANT / CEPH_CAP_OP_REVOKE (and the grant half of
// IMPORT): refresh the cap's seq/gen, absorb the shared metadata carried
// by the message (mode/owner, nlink, xattrs, dirstat, times, size/layout,
// inline data, change_attr, max_size), then reconcile the issued cap
// bits. On revocation, dirty/cached file data is flushed or released
// before the revoke is acked via check_caps().
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();
  int flags = 0;

  const unsigned new_caps = m->get_caps();
  // a cap whose gen lags the session's went stale at some point
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // accept shared auth fields only if we don't hold the EXCL cap
  // (our locally dirtied values would be newer than the MDS's)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0)
      deleted_inode = true;  // trimmed at the very end, caps settled first
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
                           m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
                           m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
        !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      // revoking the cache cap: drop cached data before acking
      if (_release(in)) {
        check = true;
        flags = CHECK_CAPS_NODELAY;
      }
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
      flags = CHECK_CAPS_NODELAY;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
        if (&p.second == cap)
          continue;
        if (p.second.implemented & ~p.second.issued & new_caps) {
          check = true;
          break;
        }
      }
    }
  }

  // just in case the caps was released just before we get the revoke msg
  if (!check && m->get_op() == CEPH_CAP_OP_REVOKE) {
    cap->wanted = 0; // don't let check_caps skip sending a response to MDS
    check = true;
    flags = CHECK_CAPS_NODELAY;
  }

  if (check)
    check_caps(in, flags);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5877
5878 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5879 {
5880 if (perms.uid() == 0) {
5881 // For directories, DACs are overridable.
5882 // For files, Read/write DACs are always overridable but executable DACs are
5883 // overridable when there is at least one exec bit set
5884 if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
5885 return -CEPHFS_EACCES;
5886 return 0;
5887 }
5888
5889 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5890 int ret = _posix_acl_permission(in, perms, want);
5891 if (ret != -CEPHFS_EAGAIN)
5892 return ret;
5893 }
5894
5895 // check permissions before doing anything else
5896 if (!in->check_mode(perms, want))
5897 return -CEPHFS_EACCES;
5898 return 0;
5899 }
5900
5901 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5902 const UserPerm& perms)
5903 {
5904 int r = _getattr_for_perm(in, perms);
5905 if (r < 0)
5906 goto out;
5907
5908 r = 0;
5909 if (strncmp(name, "system.", 7) == 0) {
5910 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5911 r = -CEPHFS_EPERM;
5912 } else {
5913 r = inode_permission(in, perms, want);
5914 }
5915 out:
5916 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5917 return r;
5918 }
5919
5920 std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
5921 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5922 return out;
5923 }
5924
// Permission check for setattr: verifies `perms` may apply the changes
// selected by `mask` to `in`. Returns 0 on success, -CEPHFS_EPERM /
// -CEPHFS_EACCES (or a getattr error) otherwise. Side effect: may clear
// S_ISGID from stx->stx_mode for unprivileged chmod.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " stx_mode: "
		 << hex << stx->stx_mode << " mask:" << mask << dec << dendl;
  // refresh uid/gid/mode from the MDS before checking against them
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate requires write permission
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -CEPHFS_EPERM;
  // chown: only root may change the owner (the owner may only "change"
  // the uid to itself)
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: the owner may switch to a group they belong to; root may do anything
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    uint32_t m = ~stx->stx_mode & in->mode; // mode bits removed
    ldout(cct, 20) << __func__ << " " << *in << " = " << hex << m << dec << dendl;
    if (perms.uid() != 0 && perms.uid() != in->uid &&
	/*
	 * Currently the kernel fuse and libfuse code is buggy and
	 * won't pass the ATTR_KILL_SUID/ATTR_KILL_SGID to ceph-fuse.
	 * But will just set the ATTR_MODE and at the same time by
	 * clearing the suid/sgid bits.
	 *
	 * Only allow unprivileged users to clear S_ISUID and S_ISUID.
	 */
	(m & ~(S_ISUID | S_ISGID)))
      goto out;

    // drop setgid when the caller isn't in the file's (possibly new) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may only set "now"-style timestamps, and then only
      // with write permission (utimes(2) semantics)
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5993
5994 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5995 {
5996 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5997 unsigned want = 0;
5998
5999 if ((flags & O_ACCMODE) == O_WRONLY)
6000 want = MAY_WRITE;
6001 else if ((flags & O_ACCMODE) == O_RDWR)
6002 want = MAY_READ | MAY_WRITE;
6003 else if ((flags & O_ACCMODE) == O_RDONLY)
6004 want = MAY_READ;
6005 if (flags & O_TRUNC)
6006 want |= MAY_WRITE;
6007
6008 int r = 0;
6009 switch (in->mode & S_IFMT) {
6010 case S_IFLNK:
6011 r = -CEPHFS_ELOOP;
6012 goto out;
6013 case S_IFDIR:
6014 if (want & MAY_WRITE) {
6015 r = -CEPHFS_EISDIR;
6016 goto out;
6017 }
6018 break;
6019 }
6020
6021 r = _getattr_for_perm(in, perms);
6022 if (r < 0)
6023 goto out;
6024
6025 r = inode_permission(in, perms, want);
6026 out:
6027 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
6028 return r;
6029 }
6030
6031 int Client::may_lookup(Inode *dir, const UserPerm& perms)
6032 {
6033 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
6034 int r = _getattr_for_perm(dir, perms);
6035 if (r < 0)
6036 goto out;
6037
6038 r = inode_permission(dir, perms, MAY_EXEC);
6039 out:
6040 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6041 return r;
6042 }
6043
6044 int Client::may_create(Inode *dir, const UserPerm& perms)
6045 {
6046 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
6047 int r = _getattr_for_perm(dir, perms);
6048 if (r < 0)
6049 goto out;
6050
6051 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
6052 out:
6053 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6054 return r;
6055 }
6056
6057 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
6058 {
6059 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
6060 int r = _getattr_for_perm(dir, perms);
6061 if (r < 0)
6062 goto out;
6063
6064 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
6065 if (r < 0)
6066 goto out;
6067
6068 /* 'name == NULL' means rmsnap w/o permission checks */
6069 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
6070 InodeRef otherin;
6071 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
6072 if (r < 0)
6073 goto out;
6074 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
6075 r = -CEPHFS_EPERM;
6076 }
6077 out:
6078 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
6079 return r;
6080 }
6081
6082 int Client::may_delete(const char *relpath, const UserPerm& perms) {
6083 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
6084
6085 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6086 if (!mref_reader.is_state_satisfied())
6087 return -CEPHFS_ENOTCONN;
6088
6089 filepath path(relpath);
6090 string name = path.last_dentry();
6091 path.pop_dentry();
6092 InodeRef dir;
6093
6094 std::scoped_lock lock(client_lock);
6095 int r = path_walk(path, &dir, perms);
6096 if (r < 0)
6097 return r;
6098 if (cct->_conf->client_permissions) {
6099 int r = may_delete(dir.get(), name.c_str(), perms);
6100 if (r < 0)
6101 return r;
6102 }
6103
6104 return 0;
6105 }
6106
6107 int Client::may_hardlink(Inode *in, const UserPerm& perms)
6108 {
6109 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
6110 int r = _getattr_for_perm(in, perms);
6111 if (r < 0)
6112 goto out;
6113
6114 if (perms.uid() == 0 || perms.uid() == in->uid) {
6115 r = 0;
6116 goto out;
6117 }
6118
6119 r = -CEPHFS_EPERM;
6120 if (!S_ISREG(in->mode))
6121 goto out;
6122
6123 if (in->mode & S_ISUID)
6124 goto out;
6125
6126 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
6127 goto out;
6128
6129 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
6130 out:
6131 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
6132 return r;
6133 }
6134
6135 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
6136 {
6137 int mask = CEPH_STAT_CAP_MODE;
6138 bool force = false;
6139 if (acl_type != NO_ACL) {
6140 mask |= CEPH_STAT_CAP_XATTR;
6141 force = in->xattr_version == 0;
6142 }
6143 return _getattr(in, mask, perms, force);
6144 }
6145
6146 vinodeno_t Client::_get_vino(Inode *in)
6147 {
6148 /* The caller must hold the client lock */
6149 return vinodeno_t(in->ino, in->snapid);
6150 }
6151
6152 /**
6153 * Resolve an MDS spec to a list of MDS daemon GIDs.
6154 *
6155 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
6156 * It may be '*' in which case it matches all GIDs.
6157 *
6158 * If no error is returned, the `targets` vector will be populated with at least
6159 * one MDS.
6160 */
// Resolve an MDS spec string (role, GID, '*', or daemon name) into a list
// of daemon GIDs.  Tries, in order: role parse ("rank"/"fs:rank"), numeric
// GID, the '*' wildcard (all daemons), and finally a name lookup.
// Returns 0 and fills *targets with >= 1 GID, or -CEPHFS_ENOENT.
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) Try to parse the spec as a role (rank or filesystem:rank).
  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  // 2) A clean integer parse is treated as a candidate GID.
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
                     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
                 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // 3) It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // 4) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == mds_gid_t{0}) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
                     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
6229
6230
6231 /**
6232 * Authenticate with mon and establish global ID
6233 */
6234 int Client::authenticate()
6235 {
6236 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6237
6238 if (monclient->is_authenticated()) {
6239 return 0;
6240 }
6241
6242 client_lock.unlock();
6243 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
6244 client_lock.lock();
6245 if (r < 0) {
6246 return r;
6247 }
6248
6249 whoami = monclient->get_global_id();
6250 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6251
6252 return 0;
6253 }
6254
// Fetch the latest FSMap (or the trimmed "fsmap.user" variant) from the
// monitors and block until our cached copy is at least that new.
// Caller must hold client_lock; the lock is dropped around blocking
// monitor calls.  Returns 0 or a negative error.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // Drop the client lock around the blocking get_version call; retry
    // while the monitor reports transient unavailability.
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // One-shot subscription to "fsmap.user"; wait_on_list blocks until the
    // map handler signals waiting_for_fsmap with a new map.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // Same dance for the full FSMap.
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
                 << fsmap_latest << dendl;
  return 0;
}
6299
6300 /**
6301 *
6302 * @mds_spec one of ID, rank, GID, "*"
6303 *
6304 */
// Send an admin command to the MDS daemon(s) matched by mds_spec.
// `onfinish` fires once every targeted daemon has replied (via the gather);
// outbl/outs receive the (last) reply payload through MDSCommandOp.
// Laggy daemons are skipped; if all targets are laggy the call fails with
// -CEPHFS_ENOENT and an explanatory *outs.
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto& gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -CEPHFS_ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto& target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // NOTE(review): client_lock is dropped before taking command_lock —
    // presumably to respect lock ordering between the two; confirm before
    // reordering anything in this loop.
    cl.unlock();
    {
      std::scoped_lock cmd_lock(command_lock);
      // Generate MDSCommandOp state
      auto &op = command_table.start_command();

      op.on_finish = gather.new_sub();
      op.cmd = cmd;
      op.outbl = outbl;
      op.outs = outs;
      op.inbl = inbl;
      op.mds_gid = target_gid;
      op.con = conn;

      ldout(cct, 4) << __func__ << ": new command op to " << target_gid
        << " tid=" << op.tid << cmd << dendl;

      // Construct and send MCommand
      MessageRef m = op.get_message(monclient->get_fsid());
      conn->send_message2(std::move(m));
    }
    cl.lock();
  }
  gather.activate();

  return 0;
}
6392
6393 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6394 {
6395 ceph_tid_t const tid = m->get_tid();
6396
6397 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6398
6399 std::scoped_lock cmd_lock(command_lock);
6400 if (!command_table.exists(tid)) {
6401 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6402 return;
6403 }
6404
6405 auto &op = command_table.get_command(tid);
6406 if (op.outbl) {
6407 *op.outbl = m->get_data();
6408 }
6409 if (op.outs) {
6410 *op.outs = m->rs;
6411 }
6412
6413 if (op.on_finish) {
6414 op.on_finish->complete(m->r);
6415 }
6416
6417 command_table.erase(tid);
6418 }
6419
6420 // -------------------
6421 // MOUNT
6422
6423 int Client::subscribe_mdsmap(const std::string &fs_name)
6424 {
6425 int r = authenticate();
6426 if (r < 0) {
6427 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6428 return r;
6429 }
6430
6431 std::string resolved_fs_name;
6432 if (fs_name.empty()) {
6433 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6434 if (resolved_fs_name.empty())
6435 // Try the backwards compatibility fs name option
6436 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6437 } else {
6438 resolved_fs_name = fs_name;
6439 }
6440
6441 std::string want = "mdsmap";
6442 if (!resolved_fs_name.empty()) {
6443 r = fetch_fsmap(true);
6444 if (r < 0)
6445 return r;
6446 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6447 if (fscid == FS_CLUSTER_ID_NONE) {
6448 return -CEPHFS_ENOENT;
6449 }
6450
6451 std::ostringstream oss;
6452 oss << want << "." << fscid;
6453 want = oss.str();
6454 }
6455 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6456
6457 monclient->sub_want(want, 0, 0);
6458 monclient->renew_subs();
6459
6460 return 0;
6461 }
6462
// Mount the filesystem: subscribe to the MDSMap, optionally wait for an
// available MDS cluster, then resolve the mount root and walk its ancestry.
// Returns 0 on success (idempotent if already mounting/mounted), a negative
// CEPHFS error, or CEPH_FUSE_NO_MDS_UP when require_mds finds the cluster
// stuck unavailable.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
                  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Block until the MDS cluster is usable; bail out hard if it is stuck.
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  if(mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
    lderr(cct) << "connections cannot be made while"
                  " the flag refuse_client_session is set" << dendl;
    return -CEPHFS_EACCES;
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // Getattr the mount point, then each ancestor up to the root — presumably
  // to prime the quota/snap-realm ancestry (TODO confirm); an EACCES on a
  // parent is tolerated once the mount root itself resolved.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6565
6566 // UNMOUNT
6567
// Close all MDS sessions, waiting (bounded by client_shutdown_timeout)
// for the close acks; sessions that never respond are force-closed with
// ETIMEDOUT.  Caller holds client_lock.
void Client::_close_sessions()
{
  // Rejected sessions have nothing to close; just drop them.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
        _close_mds_session(p.second.get());
        mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Adopt the already-held client_lock so the condvar can wait on it;
    // ownership is handed back (without unlocking) via l.release() below.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // Timeout of 0 means wait indefinitely.
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
        auto session = mds_sessions.at(*mds_ranks_closing.begin());
        // this prunes entry from mds_sessions and mds_ranks_closing
        _closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6606
6607 void Client::flush_mdlog_sync(Inode *in)
6608 {
6609 if (in->unsafe_ops.empty()) {
6610 return;
6611 }
6612
6613 std::set<mds_rank_t> anchor;
6614 for (auto &&p : in->unsafe_ops) {
6615 anchor.emplace(p->mds);
6616 }
6617 if (in->auth_cap) {
6618 anchor.emplace(in->auth_cap->session->mds_num);
6619 }
6620
6621 for (auto &rank : anchor) {
6622 auto session = &mds_sessions.at(rank);
6623 flush_mdlog(session->get());
6624 }
6625 }
6626
6627 void Client::flush_mdlog_sync()
6628 {
6629 if (mds_requests.empty())
6630 return;
6631 for (auto &p : mds_sessions) {
6632 flush_mdlog(p.second.get());
6633 }
6634 }
6635
6636 void Client::flush_mdlog(MetaSession *session)
6637 {
6638 // Only send this to Luminous or newer MDS daemons, older daemons
6639 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6640 const uint64_t features = session->con->get_features();
6641 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6642 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6643 session->con->send_message2(std::move(m));
6644 }
6645 }
6646
6647
// Abort every in-flight MDS request with `err`, wake their callers, and
// force-close all sessions.  Requests that already went unsafe are left to
// be cleaned up by the session close path.
void Client::_abort_mds_sessions(int err)
{
  // Advance the iterator before touching the request: abort()/notify may
  // lead to the entry being removed.
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions; _closed_mds_session erases the map entry,
  // so keep taking begin() until the map drains.
  while(!mds_sessions.empty()) {
    auto session = mds_sessions.begin()->second;
    _closed_mds_session(session.get(), err);
  }
}
6675
// Tear down the mount.  With abort=false this is a graceful unmount (flush
// journal, dirty data and caps, then close sessions); with abort=true (or
// when blocklisted) in-flight state is aborted/dropped instead of flushed.
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait for outstanding write requests to complete; read requests are not
  // waited on.
  mount_cond.wait(lock, [this] {
    // Only wait for write OPs
    for (auto& [tid, req] : mds_requests) {
      if (req->is_write()) {
        ldout(cct, 10) << "waiting for write request '" << tid
                       << "' to complete, currently there are "
                       << mds_requests.size()
                       << " outstanding read/write requests"
                       << dendl;
        return false;
      }
    }
    return true;
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // The MDS would not accept our flushes anyway: drop dirty caps rather
    // than flushing them.
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
        Inode *in = *p;
        ++p;
        if (in->dirty_caps) {
          ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
          in->mark_caps_clean();
          put_inode(in);
        }
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait for the cache to drain completely; dump it for debugging if we
  // stay stuck for more than 5 seconds (likely pending cap releases).
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
        r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  // release the global snapshot realm
  SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
  if (global_realm) {
    ceph_assert(global_realm->nref == 1);
    put_snap_realm(global_realm);
  }

  mref_writer.update_state(CLIENT_UNMOUNTED);

  /*
   * Stop the remount_queue before clearing the mountpoint memory
   * to avoid possible use-after-free bug.
   */
  if (remount_cb) {
    ldout(cct, 10) << "unmount stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
    remount_cb = nullptr;
  }

  ldout(cct, 2) << "unmounted." << dendl;
}
6847
// Graceful unmount: flush dirty data and caps, then close sessions cleanly.
void Client::unmount()
{
  _unmount(false);
}
6852
// Forced teardown: abort in-flight MDS requests and drop dirty state
// instead of flushing it.
void Client::abort_conn()
{
  _unmount(true);
}
6857
6858 void Client::flush_cap_releases()
6859 {
6860 uint64_t nr_caps = 0;
6861
6862 // send any cap releases
6863 for (auto &p : mds_sessions) {
6864 auto session = p.second;
6865 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
6866 p.first)) {
6867 nr_caps += session->release->caps.size();
6868 if (cct->_conf->client_inject_release_failure) {
6869 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6870 } else {
6871 session->con->send_message2(std::move(session->release));
6872 }
6873 session->release.reset();
6874 }
6875 }
6876
6877 if (nr_caps > 0) {
6878 dec_pinned_icaps(nr_caps);
6879 }
6880 }
6881
6882 void Client::renew_and_flush_cap_releases()
6883 {
6884 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6885
6886 if (!mount_aborted && mdsmap->get_epoch()) {
6887 // renew caps?
6888 auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
6889 if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
6890 renew_caps();
6891
6892 flush_cap_releases();
6893 }
6894 }
6895
// Periodic upkeep, driven by the tick thread: times out stalled mounts,
// renews caps / flushes releases, re-checks delayed caps, sends metrics,
// trims caches, and auto-reconnects after a blocklist.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  auto now = ceph::coarse_mono_clock::now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // Abort the oldest pending request once mounting has stalled past the
    // mount timeout, and wake anything waiting on it.
    MetaRequest *req = mds_requests.begin()->second;

    if (req->created + mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    // The early break assumes entries are ordered by hold_caps_until —
    // TODO(review) confirm the list ordering invariant.
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // Auto-reconnect after a blocklist: at most once every 30 minutes, and
  // only when client_reconnect_stale is enabled.
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6951
// Launch the upkeep thread, which calls tick() roughly every
// client_tick_interval seconds until tick_thread_stopped is set
// (see _unmount()).  The thread owns client_lock while awake and
// releases it while waiting on upkeep_cond.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      // client_debug_inject_tick_delay lets tests stretch the effective
      // interval beyond the configured tick interval.
      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Allow a 10% early wakeup slack; otherwise sleep the remainder.
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6982
// Entry point for periodic metric submission (called from tick()).
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6992
6993 void Client::collect_and_send_global_metrics() {
6994 ldout(cct, 20) << __func__ << dendl;
6995 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6996
6997 /* Do not send the metrics until the MDS rank is ready */
6998 if (!mdsmap->is_active((mds_rank_t)0)) {
6999 ldout(cct, 5) << __func__ << " MDS rank 0 is not ready yet -- not sending metric"
7000 << dendl;
7001 return;
7002 }
7003
7004 if (!have_open_session((mds_rank_t)0)) {
7005 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
7006 << dendl;
7007 return;
7008 }
7009 auto session = _get_or_open_mds_session((mds_rank_t)0);
7010 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
7011 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
7012 return;
7013 }
7014
7015 ClientMetricMessage metric;
7016 std::vector<ClientMetricMessage> message;
7017
7018 // read latency
7019 if (_collect_and_send_global_metrics ||
7020 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
7021 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
7022 logger->tget(l_c_rd_avg),
7023 logger->get(l_c_rd_sqsum),
7024 nr_read_request));
7025 message.push_back(metric);
7026 }
7027
7028 // write latency
7029 if (_collect_and_send_global_metrics ||
7030 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
7031 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
7032 logger->tget(l_c_wr_avg),
7033 logger->get(l_c_wr_sqsum),
7034 nr_write_request));
7035 message.push_back(metric);
7036 }
7037
7038 // metadata latency
7039 if (_collect_and_send_global_metrics ||
7040 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
7041 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
7042 logger->tget(l_c_md_avg),
7043 logger->get(l_c_md_sqsum),
7044 nr_metadata_request));
7045 message.push_back(metric);
7046 }
7047
7048 // cap hit ratio -- nr_caps is unused right now
7049 if (_collect_and_send_global_metrics ||
7050 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
7051 auto [cap_hits, cap_misses] = get_cap_hit_rates();
7052 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
7053 message.push_back(metric);
7054 }
7055
7056 // dentry lease hit ratio
7057 if (_collect_and_send_global_metrics ||
7058 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
7059 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
7060 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
7061 message.push_back(metric);
7062 }
7063
7064 // opened files
7065 if (_collect_and_send_global_metrics ||
7066 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
7067 auto [opened_files, total_inodes] = get_opened_files_rates();
7068 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
7069 message.push_back(metric);
7070 }
7071
7072 // pinned i_caps
7073 if (_collect_and_send_global_metrics ||
7074 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
7075 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
7076 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
7077 message.push_back(metric);
7078 }
7079
7080 // opened inodes
7081 if (_collect_and_send_global_metrics ||
7082 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
7083 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
7084 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
7085 message.push_back(metric);
7086 }
7087
7088 // read io sizes
7089 if (_collect_and_send_global_metrics ||
7090 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
7091 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
7092 total_read_size));
7093 message.push_back(metric);
7094 }
7095
7096 // write io sizes
7097 if (_collect_and_send_global_metrics ||
7098 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
7099 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
7100 total_write_size));
7101 message.push_back(metric);
7102 }
7103
7104 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
7105 }
7106
7107 void Client::renew_caps()
7108 {
7109 ldout(cct, 10) << "renew_caps()" << dendl;
7110 last_cap_renew = ceph::coarse_mono_clock::now();
7111
7112 for (auto &p : mds_sessions) {
7113 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
7114 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
7115 renew_caps(p.second.get());
7116 }
7117 }
7118
// Send a RENEWCAPS request to one MDS session.  The incremented
// cap_renew_seq lets the eventual reply be matched to this request.
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
7126
7127
7128 // ===============================================================
7129 // high level (POSIXy) interface
7130
7131 int Client::_do_lookup(Inode *dir, const string& name, int mask,
7132 InodeRef *target, const UserPerm& perms)
7133 {
7134 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
7135 MetaRequest *req = new MetaRequest(op);
7136 filepath path;
7137 dir->make_nosnap_relative_path(path);
7138 path.push_dentry(name);
7139 req->set_filepath(path);
7140 req->set_inode(dir);
7141 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
7142 mask |= DEBUG_GETATTR_CAPS;
7143 req->head.args.getattr.mask = mask;
7144
7145 ldout(cct, 10) << __func__ << " on " << path << dendl;
7146
7147 int r = make_request(req, perms, target);
7148 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7149 return r;
7150 }
7151
7152 bool Client::_dentry_valid(const Dentry *dn)
7153 {
7154 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7155
7156 // is dn lease valid?
7157 utime_t now = ceph_clock_now();
7158 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
7159 mds_sessions.count(dn->lease_mds)) {
7160 auto s = mds_sessions.at(dn->lease_mds);
7161 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
7162 dlease_hit();
7163 return true;
7164 }
7165
7166 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
7167 << " vs lease_gen " << dn->lease_gen << dendl;
7168 }
7169
7170 dlease_miss();
7171 return false;
7172 }
7173
/* Resolve dname within dir. Fast paths handle "..", ".", the snapdir
 * name, and cached dentries protected by a valid lease or by the
 * directory's shared cap; otherwise an MDS LOOKUP is issued and the
 * cache re-consulted. On success *target holds the inode and, if
 * requested, *alternate_name the dentry's alternate name. Returns 0 or
 * a negative CEPHFS_* error. When is_rename is false, a dentry that is
 * mid-rename makes us wait for the rename to finish before answering. */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms, std::string* alternate_name,
		    bool is_rename)
{
  int r = 0;
  Dentry *dn = NULL;
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no cached parent linkage: ask a random in MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	*target = std::move(tmptarget);
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// fall back to the directory itself on failure
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // the configured snapshot-dir name maps to the synthetic snapdir inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    // we have a cached dentry; decide whether we may trust it
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
		   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
	// touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	// make trim_caps() behave.
	dir->try_touch_cap(dn->lease_mds);
	goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// a complete, shared-cap directory lets us answer ENOENT locally
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -CEPHFS_ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }

    // In rare case during the rename if another thread tries to
    // lookup the dst dentry, it may get an inconsistent result
    // that both src dentry and dst dentry will link to the same
    // inode at the same time.
    // Will wait the rename to finish and try it again.
    if (!is_rename && dn->is_renaming) {
      ldout(cct, 1) << __func__ << " dir " << *dir
		    << " rename is on the way, will wait for dn '"
		    << dname << "'" << dendl;
      wait_on_list(waiting_for_rename);
      goto relookup;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // only one MDS lookup per call: if we already did it, accept the cache state
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

hit_dn:
  // trusted cached dentry: hand back its inode (or ENOENT for a null dentry)
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
7308
7309 Dentry *Client::get_or_create(Inode *dir, const char* name)
7310 {
7311 // lookup
7312 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7313 dir->open_dir();
7314 if (dir->dir->dentries.count(name))
7315 return dir->dir->dentries[name];
7316 else // otherwise link up a new one
7317 return link(dir->dir, name, NULL, NULL);
7318 }
7319
7320 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
7321 {
7322 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7323 if (!mref_reader.is_state_satisfied())
7324 return -CEPHFS_ENOTCONN;
7325
7326 ldout(cct, 10) << __func__ << ": " << path << dendl;
7327
7328 std::scoped_lock lock(client_lock);
7329
7330 return path_walk(path, wdr, perms, followsym);
7331 }
7332
7333 int Client::path_walk(const filepath& origpath, InodeRef *end,
7334 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
7335 {
7336 walk_dentry_result wdr;
7337 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
7338 *end = std::move(wdr.in);
7339 return rc;
7340 }
7341
/* Walk origpath component by component, starting from the root (absolute
 * paths), the cwd, or dirinode if given. Symlinks in the middle of the
 * path are always resolved; a trailing symlink is resolved only when
 * followsym is true. mask requests extra caps on the final component.
 * On success fills *result (inode + alternate name) and returns 0,
 * otherwise returns a negative CEPHFS_* error. */
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
		      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  // choose the starting inode for the walk
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;  // total symlinks traversed, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// absolute symlink target restarts the walk at the root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7430
7431
7432 // namespace ops
7433
7434 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7435 {
7436 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7437 if (!mref_reader.is_state_satisfied())
7438 return -CEPHFS_ENOTCONN;
7439
7440 tout(cct) << "link" << std::endl;
7441 tout(cct) << relexisting << std::endl;
7442 tout(cct) << relpath << std::endl;
7443
7444 filepath existing(relexisting);
7445
7446 InodeRef in, dir;
7447
7448 std::scoped_lock lock(client_lock);
7449 int r = path_walk(existing, &in, perm, true);
7450 if (r < 0)
7451 return r;
7452 if (std::string(relpath) == "/") {
7453 r = -CEPHFS_EEXIST;
7454 return r;
7455 }
7456 filepath path(relpath);
7457 string name = path.last_dentry();
7458 path.pop_dentry();
7459
7460 r = path_walk(path, &dir, perm, true);
7461 if (r < 0)
7462 return r;
7463 if (cct->_conf->client_permissions) {
7464 if (S_ISDIR(in->mode)) {
7465 r = -CEPHFS_EPERM;
7466 return r;
7467 }
7468 r = may_hardlink(in.get(), perm);
7469 if (r < 0)
7470 return r;
7471 r = may_create(dir.get(), perm);
7472 if (r < 0)
7473 return r;
7474 }
7475 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7476 return r;
7477 }
7478
// POSIX unlink(2): remove relpath, resolved relative to the current
// working directory; delegates to unlinkat() with no flags.
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
}
7483
7484 int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
7485 {
7486 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7487 if (!mref_reader.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN;
7489 }
7490
7491 tout(cct) << __func__ << std::endl;
7492 tout(cct) << dirfd << std::endl;
7493 tout(cct) << relpath << std::endl;
7494 tout(cct) << flags << std::endl;
7495
7496 if (std::string(relpath) == "/") {
7497 return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
7498 }
7499
7500 filepath path(relpath);
7501 string name = path.last_dentry();
7502 path.pop_dentry();
7503 InodeRef dir;
7504
7505 std::scoped_lock lock(client_lock);
7506
7507 InodeRef dirinode;
7508 int r = get_fd_inode(dirfd, &dirinode);
7509 if (r < 0) {
7510 return r;
7511 }
7512
7513 r = path_walk(path, &dir, perm, true, 0, dirinode);
7514 if (r < 0) {
7515 return r;
7516 }
7517 if (cct->_conf->client_permissions) {
7518 r = may_delete(dir.get(), name.c_str(), perm);
7519 if (r < 0) {
7520 return r;
7521 }
7522 }
7523 if (flags & AT_REMOVEDIR) {
7524 r = _rmdir(dir.get(), name.c_str(), perm);
7525 } else {
7526 r = _unlink(dir.get(), name.c_str(), perm);
7527 }
7528 return r;
7529 }
7530
7531 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7532 {
7533 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7534 if (!mref_reader.is_state_satisfied())
7535 return -CEPHFS_ENOTCONN;
7536
7537 tout(cct) << __func__ << std::endl;
7538 tout(cct) << relfrom << std::endl;
7539 tout(cct) << relto << std::endl;
7540
7541 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7542 return -CEPHFS_EBUSY;
7543
7544 filepath from(relfrom);
7545 filepath to(relto);
7546 string fromname = from.last_dentry();
7547 from.pop_dentry();
7548 string toname = to.last_dentry();
7549 to.pop_dentry();
7550
7551 InodeRef fromdir, todir;
7552
7553 std::scoped_lock lock(client_lock);
7554 int r = path_walk(from, &fromdir, perm);
7555 if (r < 0)
7556 goto out;
7557 r = path_walk(to, &todir, perm);
7558 if (r < 0)
7559 goto out;
7560
7561 if (cct->_conf->client_permissions) {
7562 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7563 if (r < 0)
7564 return r;
7565 r = may_delete(todir.get(), toname.c_str(), perm);
7566 if (r < 0 && r != -CEPHFS_ENOENT)
7567 return r;
7568 }
7569 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7570 out:
7571 return r;
7572 }
7573
7574 // dirs
7575
7576 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7577 {
7578 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7579 }
7580
7581 int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
7582 std::string alternate_name)
7583 {
7584 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7585 if (!mref_reader.is_state_satisfied())
7586 return -CEPHFS_ENOTCONN;
7587
7588 tout(cct) << __func__ << std::endl;
7589 tout(cct) << dirfd << std::endl;
7590 tout(cct) << relpath << std::endl;
7591 tout(cct) << mode << std::endl;
7592 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7593
7594 if (std::string(relpath) == "/") {
7595 return -CEPHFS_EEXIST;
7596 }
7597
7598 filepath path(relpath);
7599 string name = path.last_dentry();
7600 path.pop_dentry();
7601 InodeRef dir;
7602
7603 std::scoped_lock lock(client_lock);
7604
7605 InodeRef dirinode;
7606 int r = get_fd_inode(dirfd, &dirinode);
7607 if (r < 0) {
7608 return r;
7609 }
7610
7611 r = path_walk(path, &dir, perm, true, 0, dirinode);
7612 if (r < 0) {
7613 return r;
7614 }
7615 if (cct->_conf->client_permissions) {
7616 r = may_create(dir.get(), perm);
7617 if (r < 0) {
7618 return r;
7619 }
7620 }
7621 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7622 }
7623
/* mkdir -p: walk as far down relpath as already exists, then create the
 * remaining components one level at a time. Returns 0 on success or a
 * negative CEPHFS_* error. A concurrent creator racing us at an
 * intermediate level is tolerated (EEXIST there falls back to lookup). */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only ENOENT means "start creating here"; anything else is fatal
  // (and r == 0 means the whole path already exists)
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      // someone else created this intermediate level; look it up instead
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7680
// POSIX rmdir(2): remove an empty directory, resolved relative to the
// current working directory; delegates to unlinkat(AT_REMOVEDIR).
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
}
7685
7686 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7687 {
7688 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7689 if (!mref_reader.is_state_satisfied())
7690 return -CEPHFS_ENOTCONN;
7691
7692 tout(cct) << __func__ << std::endl;
7693 tout(cct) << relpath << std::endl;
7694 tout(cct) << mode << std::endl;
7695 tout(cct) << rdev << std::endl;
7696
7697 if (std::string(relpath) == "/")
7698 return -CEPHFS_EEXIST;
7699
7700 filepath path(relpath);
7701 string name = path.last_dentry();
7702 path.pop_dentry();
7703 InodeRef dir;
7704
7705 std::scoped_lock lock(client_lock);
7706 int r = path_walk(path, &dir, perms);
7707 if (r < 0)
7708 return r;
7709 if (cct->_conf->client_permissions) {
7710 int r = may_create(dir.get(), perms);
7711 if (r < 0)
7712 return r;
7713 }
7714 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7715 }
7716
7717 // symlinks
7718
7719 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7720 {
7721 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7722 }
7723
7724 int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
7725 std::string alternate_name)
7726 {
7727 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7728 if (!mref_reader.is_state_satisfied()) {
7729 return -CEPHFS_ENOTCONN;
7730 }
7731
7732 tout(cct) << __func__ << std::endl;
7733 tout(cct) << target << std::endl;
7734 tout(cct) << dirfd << std::endl;
7735 tout(cct) << relpath << std::endl;
7736
7737 if (std::string(relpath) == "/") {
7738 return -CEPHFS_EEXIST;
7739 }
7740
7741 filepath path(relpath);
7742 string name = path.last_dentry();
7743 path.pop_dentry();
7744 InodeRef dir;
7745
7746 std::scoped_lock lock(client_lock);
7747
7748 InodeRef dirinode;
7749 int r = get_fd_inode(dirfd, &dirinode);
7750 if (r < 0) {
7751 return r;
7752 }
7753 r = path_walk(path, &dir, perms, true, 0, dirinode);
7754 if (r < 0) {
7755 return r;
7756 }
7757 if (cct->_conf->client_permissions) {
7758 int r = may_create(dir.get(), perms);
7759 if (r < 0) {
7760 return r;
7761 }
7762 }
7763 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7764 }
7765
// POSIX readlink(2): read the target of a symlink, resolved relative to
// the current working directory; delegates to readlinkat().
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
}
7770
7771 int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
7772 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7773 if (!mref_reader.is_state_satisfied()) {
7774 return -CEPHFS_ENOTCONN;
7775 }
7776
7777 tout(cct) << __func__ << std::endl;
7778 tout(cct) << dirfd << std::endl;
7779 tout(cct) << relpath << std::endl;
7780
7781 InodeRef dirinode;
7782 std::scoped_lock lock(client_lock);
7783 int r = get_fd_inode(dirfd, &dirinode);
7784 if (r < 0) {
7785 return r;
7786 }
7787
7788 InodeRef in;
7789 filepath path(relpath);
7790 r = path_walk(path, &in, perms, false, 0, dirinode);
7791 if (r < 0) {
7792 return r;
7793 }
7794
7795 return _readlink(in.get(), buf, size);
7796 }
7797
7798 int Client::_readlink(Inode *in, char *buf, size_t size)
7799 {
7800 if (!in->is_symlink())
7801 return -CEPHFS_EINVAL;
7802
7803 // copy into buf (at most size bytes)
7804 int r = in->symlink.length();
7805 if (r > (int)size)
7806 r = size;
7807 memcpy(buf, in->symlink.c_str(), r);
7808 return r;
7809 }
7810
7811
7812 // inode stuff
7813
7814 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7815 {
7816 bool yes = in->caps_issued_mask(mask, true);
7817
7818 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7819 if (yes && !force)
7820 return 0;
7821
7822 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7823 filepath path;
7824 in->make_nosnap_relative_path(path);
7825 req->set_filepath(path);
7826 req->set_inode(in);
7827 req->head.args.getattr.mask = mask;
7828
7829 int res = make_request(req, perms);
7830 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7831 return res;
7832 }
7833
/* Fetch a virtual xattr value via the MDS GETVXATTR op. Mirrors
 * getxattr(2) semantics: returns the value length (size == 0 just probes
 * the length), -CEPHFS_ERANGE when the buffer is too small, and
 * -CEPHFS_ENODATA for an invalid name or an MDS without the op. */
int Client::_getvxattr(
  Inode *in,
  const UserPerm& perms,
  const char *xattr_name,
  ssize_t size,
  void *value,
  mds_rank_t rank)
{
  // names must be non-empty and at most 255 bytes
  if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
    return -CEPHFS_ENODATA;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->set_string2(xattr_name);

  bufferlist bl;
  // gate on CEPHFS_FEATURE_OP_GETVXATTR so older MDSes fail cleanly
  int res = make_request(req, perms, nullptr, nullptr, rank, &bl,
                         CEPHFS_FEATURE_OP_GETVXATTR);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;

  if (res < 0) {
    // map "operation not supported" to the getxattr-style ENODATA
    if (res == -CEPHFS_EOPNOTSUPP) {
      return -CEPHFS_ENODATA;
    }
    return res;
  }

  // decode the single string payload from the reply
  std::string buf;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  decode(buf, p);
  DECODE_FINISH(p);

  ssize_t len = buf.length();

  res = len; // refer to man getxattr(2) for output buffer size == 0

  if (size > 0) {
    if (len > size) {
      res = -CEPHFS_ERANGE; // insufficient output buffer space
    } else {
      memcpy(value, buf.c_str(), len);
    }
  }
  return res;
}
7885
/* Apply the setattr fields selected by mask from stx to in. Each
 * attribute is applied locally when the client holds the relevant
 * exclusive cap (marking caps dirty for later writeback); anything left
 * in mask afterwards is sent to the MDS as a SETATTR request. aux
 * carries the raw fscrypt_auth/fscrypt_file payloads. Returns 0 (all
 * changes applied locally), the MDS request result, or a negative
 * CEPHFS_* error from the up-front checks. */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp,
			std::vector<uint8_t>* aux)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;
  bool kill_sguid = false;
  int inode_drop = 0;    // caps to drop when the request goes to the MDS
  size_t auxsize = 0;

  if (aux)
    auxsize = aux->size();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << " aux size " << auxsize << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // reject size increases that would blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  // Can't set fscrypt_auth and file at the same time!
  if ((mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)) ==
      (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE))
    return -CEPHFS_EINVAL;

  // fscrypt changes require the aux payload
  if (!aux && (mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)))
    return -CEPHFS_EINVAL;

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // setuid/setgid stripping can only be done locally with Ax caps
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    kill_sguid = !!(mask & CEPH_SETATTR_KILL_SGUID);
  }

  // For each attribute below the pattern is: apply locally and mark the
  // exclusive cap dirty when we hold it; otherwise queue the value for
  // the MDS request (dropping shared caps) unless nothing would change.
  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->uid != stx->stx_uid) {
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; preserve the file type bits
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) && S_ISREG(in->mode)) {
    // no explicit chmod: strip setuid/setgid locally where requested
    if (kill_sguid && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      in->mode &= ~(S_ISUID|S_ISGID);
    } else {
      if (mask & CEPH_SETATTR_KILL_SUID) {
	in->mode &= ~S_ISUID;
      }
      if (mask & CEPH_SETATTR_KILL_SGID) {
	in->mode &= ~S_ISGID;
      }
    }
    mask &= ~(CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    ldout(cct,10) << "resetting cached fscrypt_auth field. size now "
		  << in->fscrypt_auth.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_auth = *aux;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->fscrypt_auth != *aux) {
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // growing (or keeping) the size with Fx caps and no sguid-kill needed
    // can be done locally; shrinking must go through the MDS (truncate)
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
	!(mask & CEPH_SETATTR_KILL_SGUID) &&
	stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
	in->size = in->reported_size = stx->stx_size;
	in->cap_dirtier_uid = perms.uid();
	in->cap_dirtier_gid = perms.gid();
	in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
	mask &= ~(CEPH_SETATTR_SIZE);
	mask |= CEPH_SETATTR_MTIME;
      } else {
	// ignore it when size doesn't change
	mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    ldout(cct,10) << "resetting cached fscrypt_file field. size now "
		  << in->fscrypt_file.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_file = *aux;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->fscrypt_file != *aux) {
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_mtime) > in->mtime) {
      // with Fw caps a forward-moving mtime may be applied locally
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
	       utime_t(stx->stx_atime) > in->atime) {
      // with Fw caps a forward-moving atime may be applied locally
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
	       in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
	CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  // everything handled locally: bump change_attr, refresh the cached
  // snapdir inode's attrs if present, and skip the MDS round trip
  if (!mask) {
    in->change_attr++;
    if (in->is_dir() && in->snapid == CEPH_NOSNAP) {
      vinodeno_t vino(in->ino, CEPH_SNAPDIR);
      if (inode_map.count(vino)) {
	refresh_snapdir_attrs(inode_map[vino], in);
      }
    }
    return 0;
  }

  // send the remaining attribute changes to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    req->fscrypt_auth = *aux;
  } else if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    req->fscrypt_file = *aux;
  }
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
8207
/* Note that we only care about attrs that setattr cares about */
/*
 * Copy the setattr-relevant fields (size, mode, uid, gid, mtime, atime)
 * from a struct stat into a struct ceph_statx.  Other statx fields are
 * left untouched; callers are expected to pass a mask restricting which
 * fields are actually consumed.
 */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  /* macOS spells the timespec members st_*timespec */
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#elif __WIN32
  /* NOTE(review): `__WIN32` (no trailing underscores) is unusual; the
   * conventional macro is `_WIN32` — confirm this branch is actually
   * selected on Windows builds. Windows' struct stat only carries
   * whole-second timestamps, so nanoseconds are zeroed. */
  stx->stx_mtime.tv_sec = st->st_mtime;
  stx->stx_mtime.tv_nsec = 0;
  stx->stx_atime.tv_sec = st->st_atime;
  stx->stx_atime.tv_nsec = 0;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
8228
/*
 * Inner setattr worker: applies the statx-style attribute change to *in.
 *
 * Truncation (CEPH_SETATTR_SIZE) additionally clears setuid/setgid bits
 * as required by POSIX; a mode change re-runs the POSIX ACL chmod logic
 * after the attribute update succeeds.
 *
 * @param in   inode to modify
 * @param stx  new attribute values (fields selected by mask)
 * @param mask CEPH_SETATTR_* bits to apply
 * @param inp  optional out-param for the resulting inode ref
 * @return 0 on success, negative CEPHFS error code otherwise
 */
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
		       const UserPerm& perms, InodeRef *inp)
{
  if (mask & CEPH_SETATTR_SIZE) {
    /* truncating must drop suid/sgid; clear_suid_sgid reports which
     * extra mask bits the follow-up setattr needs */
    mask |= clear_suid_sgid(in, perms, true);
  }

  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
8243
/*
 * Public-facing statx-style setattr entry point (lock already held by
 * callers).  Restricts the mask to the attribute bits supported here,
 * optionally enforces client-side permission checks, then delegates to
 * __setattrx().
 */
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  /* drop any mask bits this path does not support (e.g. fscrypt) */
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
8258
8259 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
8260 const UserPerm& perms)
8261 {
8262 struct ceph_statx stx;
8263
8264 stat_to_statx(attr, &stx);
8265 mask &= ~CEPH_SETATTR_BTIME;
8266
8267 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
8268 mask &= ~CEPH_SETATTR_UID;
8269 }
8270 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
8271 mask &= ~CEPH_SETATTR_GID;
8272 }
8273
8274 return _setattrx(in, &stx, mask, perms);
8275 }
8276
/*
 * Path-based setattr (follows symlinks).  Fails with ENOTCONN unless the
 * client is at least mounting; walks the path under client_lock and then
 * applies the change via _setattr().
 */
int Client::setattr(const char *relpath, struct stat *attr, int mask,
		    const UserPerm& perms)
{
  /* refuse if the client is not mounted/mounting */
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
8297
/*
 * Path-based statx-style setattr.  AT_SYMLINK_NOFOLLOW in flags controls
 * whether a trailing symlink is followed; otherwise mirrors setattr().
 */
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
		     const UserPerm& perms, int flags)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  /* follow the final symlink unless AT_SYMLINK_NOFOLLOW was given */
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
8318
/*
 * fd-based setattr.  Rejects O_PATH file handles (Linux only) since they
 * do not permit attribute modification.
 */
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  /* O_PATH handles cannot be used for attribute changes */
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
8339
/*
 * fd-based statx-style setattr; see fsetattr() for the struct stat
 * flavour.  O_PATH file handles are rejected on Linux.
 */
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  /* O_PATH handles cannot be used for attribute changes */
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
8360
/*
 * stat(2) equivalent: walk relpath (following symlinks), refresh the
 * inode attributes covered by mask from the MDS if needed, and fill
 * *stbuf.  Optionally returns the directory fragstat via dirstat.
 *
 * @return caps-issued bits from fill_stat() on success (>= 0), or a
 *         negative CEPHFS error code.
 */
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  /* make sure the caps covering `mask` are fresh before filling */
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8388
8389 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8390 {
8391 unsigned mask = 0;
8392
8393 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8394 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
8395 goto out;
8396
8397 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8398 mask |= CEPH_CAP_PIN;
8399 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8400 mask |= CEPH_CAP_AUTH_SHARED;
8401 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8402 mask |= CEPH_CAP_LINK_SHARED;
8403 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
8404 mask |= CEPH_CAP_FILE_SHARED;
8405 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8406 mask |= CEPH_CAP_XATTR_SHARED;
8407 out:
8408 return mask;
8409 }
8410
/* statx(2) equivalent: delegate to statxat() relative to the CWD. */
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
8417
/*
 * lstat(2) equivalent: like stat() but does NOT follow a trailing
 * symlink, so the link's own attributes are returned.
 */
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8446
/*
 * Populate a struct stat from the cached inode state.
 *
 * Directory nlink is synthesized (parent dentry + "." + subdirs) rather
 * than taken from the inode; directory size is either recursive bytes,
 * the snapshot count (for the .snap dir), or the entry count, depending
 * on configuration and snapid.
 *
 * @param dirstat optional out-param receiving the directory fragstat
 * @param rstat   optional out-param receiving the recursive stats
 * @return the caps currently issued for the inode
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  /* the snapid doubles as the device number so snapshots look distinct */
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      st->st_nlink = 1 /* parent dentry */
		     + 1 /* <dir>/. */
		     + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  /* ctime is reported as the later of ctime and mtime */
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes) {
      st->st_size = in->rstat.rbytes;
    } else if (in->snapid == CEPH_SNAPDIR) {
      /* the .snap directory reports the number of snapshots as its size */
      SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
      if (realm) {
	st->st_size = realm->my_snaps.size();
	put_snap_realm(realm);
      }
    } else {
      st->st_size = in->dirstat.size();
    }
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    /* 512-byte blocks, rounded up */
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8523
/*
 * Populate a struct ceph_statx from the cached inode state.
 *
 * `mask` is the set of CEPH_CAP_* shared caps whose data is known fresh;
 * only field groups covered by those caps are filled, and stx_mask is
 * updated to reflect which statx fields were actually set.  A mask of 0
 * (AT_STATX_DONT_SYNC) means fill everything from cache.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << " change_attr " << in->change_attr << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (uint64_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      /* directory nlink is synthesized, matching fill_stat() */
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	stx->stx_nlink = 1 /* parent dentry */
			 + 1 /* <dir>/. */
			 + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes) {
	stx->stx_size = in->rstat.rbytes;
      } else if (in->snapid == CEPH_SNAPDIR) {
	/* the .snap directory reports the number of snapshots as its size */
	SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
	if (realm) {
	  stx->stx_size = realm->my_snaps.size();
	  put_snap_realm(realm);
	}
      } else {
	stx->stx_size = in->dirstat.size();
      }
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8613
/* Mark a dentry as recently used in the LRU so it is evicted last. */
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8618
/* chmod(2) equivalent: delegate to chmodat() relative to the CWD. */
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8623
/*
 * fchmod(2) equivalent: change mode through an open file handle.
 * O_PATH handles are rejected on Linux.
 */
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  /* only st_mode is read by _setattr given this mask */
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
8646
/*
 * fchmodat(2) equivalent: change mode of a path resolved relative to
 * dirfd.  AT_SYMLINK_NOFOLLOW in flags controls symlink handling.
 */
int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
		    const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  /* only st_mode is read by _setattr given this mask */
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
8678
/* lchmod equivalent: chmodat() without following a trailing symlink. */
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8683
/* chown(2) equivalent: delegate to chownat() relative to the CWD. */
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8689
/*
 * fchown(2) equivalent: change ownership through an open file handle.
 * A uid/gid of -1 means "leave unchanged" and is excluded from the mask.
 * O_PATH handles are rejected on Linux.
 */
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  /* chown(2) convention: -1 means "don't change this id" */
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
8717
/* lchown(2) equivalent: chownat() without following a trailing symlink. */
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8723
/*
 * fchownat(2) equivalent: change ownership of a path resolved relative
 * to dirfd.  Both UID and GID mask bits are always passed; _setattr()
 * strips the bit for any id given as -1 ("leave unchanged").
 */
int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
		    int flags, const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
8757
/*
 * Helper: store atime and mtime into a struct stat using the portable
 * stat_set_* accessors (the member names differ across platforms).
 */
static void attr_set_atime_and_mtime(struct stat *attr,
				     const utime_t &atime,
				     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
8767
8768 // for [l]utime() invoke the timeval variant as the timespec
8769 // variant are not yet implemented. for futime[s](), invoke
8770 // the timespec variant.
8771 int Client::utime(const char *relpath, struct utimbuf *buf,
8772 const UserPerm& perms)
8773 {
8774 struct timeval tv[2];
8775 tv[0].tv_sec = buf->actime;
8776 tv[0].tv_usec = 0;
8777 tv[1].tv_sec = buf->modtime;
8778 tv[1].tv_usec = 0;
8779
8780 return utimes(relpath, tv, perms);
8781 }
8782
8783 int Client::lutime(const char *relpath, struct utimbuf *buf,
8784 const UserPerm& perms)
8785 {
8786 struct timeval tv[2];
8787 tv[0].tv_sec = buf->actime;
8788 tv[0].tv_usec = 0;
8789 tv[1].tv_sec = buf->modtime;
8790 tv[1].tv_usec = 0;
8791
8792 return lutimes(relpath, tv, perms);
8793 }
8794
8795 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8796 {
8797 struct timespec ts[2];
8798 ts[0].tv_sec = buf->actime;
8799 ts[0].tv_nsec = 0;
8800 ts[1].tv_sec = buf->modtime;
8801 ts[1].tv_nsec = 0;
8802
8803 return futimens(fd, ts, perms);
8804 }
8805
/*
 * utimes(2) equivalent: set atime/mtime on a path (follows symlinks).
 * times[0] is atime, times[1] is mtime.
 */
int Client::utimes(const char *relpath, struct timeval times[2],
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  /* only stx_atime/stx_mtime are read by _setattrx given this mask */
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8833
/*
 * lutimes(2) equivalent: like utimes() but does NOT follow a trailing
 * symlink, so the timestamps apply to the link itself.
 */
int Client::lutimes(const char *relpath, struct timeval times[2],
		    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  /* false => don't follow the final symlink */
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8861
8862 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8863 {
8864 struct timespec ts[2];
8865 ts[0].tv_sec = times[0].tv_sec;
8866 ts[0].tv_nsec = times[0].tv_usec * 1000;
8867 ts[1].tv_sec = times[1].tv_sec;
8868 ts[1].tv_nsec = times[1].tv_usec * 1000;
8869
8870 return futimens(fd, ts, perms);
8871 }
8872
/*
 * futimens(2) equivalent: set atime/mtime through an open file handle.
 * O_PATH handles are rejected on Linux.
 */
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8900
/*
 * utimensat(2) equivalent: set atime/mtime on a path resolved relative
 * to dirfd.  AT_SYMLINK_NOFOLLOW in flags controls symlink handling.
 */
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
		      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  /* NOTE(review): this tests an open(2) flag (O_PATH) against the AT_*
   * flags argument — looks suspect, but intent appears to be rejecting
   * O_PATH-style usage; confirm against kernel utimensat semantics. */
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8943
/*
 * flock(2) equivalent: apply an advisory lock operation to fd on behalf
 * of the given lock owner token.
 */
int Client::flock(int fd, int operation, uint64_t owner)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  return _flock(f, operation, owner);
}
8962
/*
 * opendir(3) equivalent: walk relpath (following symlinks), optionally
 * check open permission, and allocate a dir_result_t in *dirpp.
 */
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR)
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  return r;
}
8990
/*
 * fdopendir(3) equivalent: open a directory stream from an already-open
 * directory file descriptor.
 */
int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;

  InodeRef dirinode;
  std::scoped_lock locker(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  if (cct->_conf->client_permissions) {
    r = may_open(dirinode.get(), O_RDONLY, perms);
    if (r < 0) {
      return r;
    }
  }
  r = _opendir(dirinode.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR) {
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  }
  return r;
}
9020
/*
 * Allocate a dir_result_t for inode `in` and register it in opened_dirs.
 * Fails with ENOTDIR (leaving *dirpp untouched) if `in` is not a
 * directory.
 */
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -CEPHFS_ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
9030
9031
/* closedir(3) equivalent: tear down a dir stream under client_lock. */
int Client::closedir(dir_result_t *dir)
{
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
9042
/*
 * Internal dir-stream teardown: release the inode ref, drop any buffered
 * readdir results, unregister from opened_dirs and free the handle.
 */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
9055
/*
 * rewinddir(3) equivalent: discard the buffered frag and reset the
 * stream position to the beginning.  Silently a no-op if not mounted.
 */
void Client::rewinddir(dir_result_t *dirp)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
9069
/* telldir(3) equivalent: report the current stream offset. */
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
9076
/*
 * seekdir(3) equivalent: reposition the stream.  Seeking invalidates the
 * shared-readdir-cache bookkeeping (forward seeks clear release_count,
 * backward seeks clear ordered_count) and may force the buffered frag to
 * be dropped and refetched if the target offset lies outside it.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    /* in hash order only a backward seek invalidates the buffer */
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    /* drop buffer when seeking to start, a different frag, or backward
     * within the current frag */
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
9111
9112
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
/*
 * Fill a struct dirent from a readdir entry.  The name is truncated to
 * 255 bytes and always NUL-terminated; d_ino/d_off/d_type are populated
 * only on platforms whose dirent carries them.
 */
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  /* cap at 255 chars and guarantee termination (strncpy does not) */
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  /* convert S_IF* file type to the DT_* dirent encoding */
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
9135
/*
 * Advance the dir stream past the currently-buffered fragment.  Marks
 * the stream complete when the rightmost frag is exhausted; otherwise
 * moves to the next frag, preserving last_name in hash order (offsets
 * are monotonic there) and resetting it in frag order.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    /* frag tree may have changed; re-map to the authoritative frag */
    _readdir_rechoose_frag(dirp);
  }
}
9161
/*
 * Re-map the stream's current fragment through the inode's (possibly
 * updated) dirfragtree, restarting at the mapped frag if it changed.
 * No-op in hash order, where offsets are frag-independent.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
9178
/* Discard the dir stream's buffered readdir entries. */
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
9184
/*
 * Fetch one directory fragment from the MDS into dirp->buffer.
 *
 * The frag is derived from the stream offset (via the dirfragtree in
 * hash order).  The request itself is built by fill_req_cb so readdir
 * and readdir-snapdiff can share this path.  On EAGAIN (frag moved) the
 * frag is re-chosen and the fetch retried recursively; on any other
 * error the stream is marked at end.
 *
 * @return 0 on success, negative error code otherwise
 */
int Client::_readdir_get_frag(int op, dir_result_t* dirp,
			      fill_readdir_args_cb_t fill_req_cb)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  fill_req_cb(dirp, req, diri, fg);

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    /* frag split/merged underneath us: re-map and retry */
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(op, dirp, fill_req_cb);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
9225
// Comparator used with std::lower_bound over Dir::readdir_cache: orders
// cached dentries by readdir offset via dir_result_t::fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
9231
// Serve readdir entries from the locally cached Dir (dirp->inode->dir).
// Valid only while the directory inode stays complete-and-ordered; the
// moment that (or the cache layout under us) changes we bail with
// -CEPHFS_EAGAIN so the caller can fall back to MDS round-trips.
//
// Returns 0 at end-of-directory, a positive value propagated from the
// callback (stop requested), -CEPHFS_EAGAIN on cache invalidation, or
// another negative error.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name
	   << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume from where the previous call left off (cache is offset-sorted)
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // if the dir lost complete+ordered state mid-scan, the cache is stale
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      // negative (null) dentry; nothing to report
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      // dentry was cached under an older shared-cap generation; skip it
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // remember position by index: _getattr() below may reallocate the cache
    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // hand the callback a referenced inode; caller releases it
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // drop client_lock around the user callback to avoid re-entrancy/deadlock
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // only commit progress after the callback accepted the entry
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
9326
9327 int Client::readdir_r_cb(dir_result_t* d,
9328 add_dirent_cb_t cb,
9329 void* p,
9330 unsigned want,
9331 unsigned flags,
9332 bool getref)
9333 {
9334 auto fill_readdir_cb = [](dir_result_t* dirp,
9335 MetaRequest* req,
9336 InodeRef& diri,
9337 frag_t fg) {
9338 filepath path;
9339 diri->make_nosnap_relative_path(path);
9340 req->set_filepath(path);
9341 req->set_inode(diri.get());
9342 req->head.args.readdir.frag = fg;
9343 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
9344 if (dirp->last_name.length()) {
9345 req->path2.set_path(dirp->last_name);
9346 } else if (dirp->hash_order()) {
9347 req->head.args.readdir.offset_hash = dirp->offset_high();
9348 }
9349 req->dirp = dirp;
9350 };
9351 int op = CEPH_MDS_OP_READDIR;
9352 if (d->inode && d->inode->snapid == CEPH_SNAPDIR)
9353 op = CEPH_MDS_OP_LSSNAP;
9354 return _readdir_r_cb(op,
9355 d,
9356 cb,
9357 fill_readdir_cb,
9358 p,
9359 want,
9360 flags,
9361 getref,
9362 false);
9363 }
9364
9365 //
9366 // NB: this is used for both readdir and readdir_snapdiff results processing
9367 // hence it should be request type agnostic
9368 //
// Core readdir engine shared by readdir_r_cb() and readdir_snapdiff().
//
// Emits directory entries one at a time through `cb`:
//   1. synthesizes "." and ".." at offsets 0 and 1,
//   2. serves from the local dentry cache when allowed and valid,
//   3. otherwise pulls frags from the MDS via `fill_cb`-built requests.
//
// op           - MDS op to issue when fetching frags
// d            - directory iterator (position state lives here)
// cb           - per-entry callback; <0 aborts, >0 stops after the entry
// fill_cb      - fills the op-specific MetaRequest fields
// want/flags   - statx field selection, mapped to caps via statx_to_mask()
// getref       - if true, pass a referenced Inode* to the callback
// bypass_cache - if true, never serve from (or mark) the dentry cache
//
// Returns 0 at end-of-directory, a positive callback value, or a negative
// error.
int Client::_readdir_r_cb(int op,
			  dir_result_t *d,
			  add_dirent_cb_t cb,
			  fill_readdir_args_cb_t fill_cb,
			  void *p,
			  unsigned want,
			  unsigned flags,
			  bool getref,
			  bool bypass_cache)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize the "." entry from the directory inode itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock around the user callback
    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." from the first parent (or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << __func__
		 << " offset " << hex << dirp->offset << dec
		 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
		 << dirp->inode->is_complete_and_ordered()
		 << " issued " << ccap_string(dirp->inode->caps_issued())
		 << dendl;
  if (!bypass_cache &&
      dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // EAGAIN means the cache became unusable mid-scan: fall through to MDS
    if (err != -CEPHFS_EAGAIN)
      return err;
  }

  // MDS-backed path: fetch frags and replay buffered entries to the callback
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(op, dirp, fill_cb);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;   // reply just refreshed the inodes; skip _getattr
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << __func__
		   << " frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << __func__
		     << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " snap " << entry.inode->snapid
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // the frag didn't fit in one reply; fetch its next chunk
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue; // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // finished the rightmost frag: if nothing changed under us during the
    // scan, we can mark the directory complete (and possibly ordered) so
    // future readdirs are served from cache
    if (!bypass_cache &&
	diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9579
9580
9581 int Client::readdir_r(dir_result_t *d, struct dirent *de)
9582 {
9583 return readdirplus_r(d, de, 0, 0, 0, NULL);
9584 }
9585
9586 /*
9587 * readdirplus_r
9588 *
9589 * returns
9590 * 1 if we got a dirent
9591 * 0 for end of directory
9592 * <0 on error
9593 */
9594
// Callback state used to extract exactly one entry from the readdir engine.
struct single_readdir {
  struct dirent *de;      // caller-owned destination for the dirent
  struct ceph_statx *stx; // optional statx destination; may be NULL
  Inode *inode;           // inode pointer handed to the callback
  bool full;              // set once an entry has been stored
};
9601
9602 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9603 struct ceph_statx *stx, off_t off,
9604 Inode *in)
9605 {
9606 single_readdir *c = static_cast<single_readdir *>(p);
9607
9608 if (c->full)
9609 return -1; // already filled this dirent
9610
9611 *c->de = *de;
9612 if (c->stx)
9613 *c->stx = *stx;
9614 c->inode = in;
9615 c->full = true;
9616 return 1;
9617 }
9618
9619 struct dirent *Client::readdir(dir_result_t *d)
9620 {
9621 int ret;
9622 auto& de = d->de;
9623 single_readdir sr;
9624 sr.de = &de;
9625 sr.stx = NULL;
9626 sr.inode = NULL;
9627 sr.full = false;
9628
9629 // our callback fills the dirent and sets sr.full=true on first
9630 // call, and returns -1 the second time around.
9631 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9632 if (ret < -1) {
9633 errno = -ret; // this sucks.
9634 return (dirent *) NULL;
9635 }
9636 if (sr.full) {
9637 return &de;
9638 }
9639 return (dirent *) NULL;
9640 }
9641
9642 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9643 struct ceph_statx *stx, unsigned want,
9644 unsigned flags, Inode **out)
9645 {
9646 single_readdir sr;
9647 sr.de = de;
9648 sr.stx = stx;
9649 sr.inode = NULL;
9650 sr.full = false;
9651
9652 // our callback fills the dirent and sets sr.full=true on first
9653 // call, and returns -1 the second time around.
9654 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9655 if (r < -1)
9656 return r;
9657 if (out)
9658 *out = sr.inode;
9659 if (sr.full)
9660 return 1;
9661 return 0;
9662 }
9663
// Return one entry of the difference between the snapshot that d1 iterates
// and snapshot snap2, via the READDIR_SNAPDIFF MDS op.
//
// d1       - open iterator on one snapshot of the directory
// snap2    - the other snapshot id (must differ from d1's)
// out_de   - optional dirent output
// out_snap - optional output: the entry's snap id (conveyed in stx.stx_dev)
//
// Returns 1 if an entry was produced, 0 at end, <0 on error (errno set).
int Client::readdir_snapdiff(dir_result_t* d1, snapid_t snap2,
			     struct dirent* out_de,
			     snapid_t* out_snap)
{
  if (!d1 || !d1->inode || d1->inode->snapid == snap2) {
    lderr(cct) << __func__ << " invalid parameters: "
	       << " d1:" << d1
	       << " d1->inode:" << (d1 ? d1->inode : nullptr)
	       << " snap2 id :" << snap2
	       << dendl;
    errno = EINVAL;
    return -errno;
  }

  auto& de = d1->de;
  ceph_statx stx;
  single_readdir sr;
  sr.de = &de;
  sr.stx = &stx;
  sr.inode = NULL;
  sr.full = false;

  // builds the MDS request for one frag of the snapdiff listing; captures
  // snap2 by reference, so it must not outlive this function
  auto fill_snapdiff_cb = [&](dir_result_t* dirp,
		       MetaRequest* req,
		       InodeRef& diri,
		       frag_t fg) {
    filepath path;
    diri->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->set_inode(diri.get());
    req->head.args.snapdiff.snap_other = snap2;
    req->head.args.snapdiff.frag = fg;
    req->head.args.snapdiff.flags = CEPH_READDIR_REPLY_BITFLAGS;
    if (dirp->last_name.length()) {
      // resume after the last entry already returned
      req->path2.set_path(dirp->last_name);
    } else if (dirp->hash_order()) {
      req->head.args.snapdiff.offset_hash = dirp->offset_high();
    }
    req->dirp = dirp;
  };

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  // bypass_cache=true: snapdiff results must always come from the MDS
  int ret = _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF,
			  d1,
			  _readdir_single_dirent_cb,
			  fill_snapdiff_cb,
			  (void*)&sr,
			  0,
			  AT_STATX_DONT_SYNC,
			  false,
			  true);
  if (ret < -1) {
    lderr(cct) << __func__ << " error: "
	       << cpp_strerror(ret)
	       << dendl;
    errno = -ret; // this sucks.
    return ret;
  }

  ldout(cct, 15) << __func__ << " " << ret
		 << " " << sr.de->d_name
		 << " " << stx.stx_dev
		 << dendl;
  if (sr.full) {
    if (out_de) {
      *out_de = de;
    }
    if (out_snap) {
      // the MDS reports the entry's snap id through stx_dev
      *out_snap = stx.stx_dev;
    }
    return 1;
  }
  return 0;
}
9739
9740 /* getdents */
/* getdents */
struct getdents_result {
  char *buf;    // destination buffer supplied by the caller
  int buflen;   // total capacity of buf in bytes
  int pos;      // number of bytes packed so far
  bool fullent; // true: pack whole struct dirent; false: just the names
};
9747
9748 static int _readdir_getdent_cb(void *p, struct dirent *de,
9749 struct ceph_statx *stx, off_t off, Inode *in)
9750 {
9751 struct getdents_result *c = static_cast<getdents_result *>(p);
9752
9753 int dlen;
9754 if (c->fullent)
9755 dlen = sizeof(*de);
9756 else
9757 dlen = strlen(de->d_name) + 1;
9758
9759 if (c->pos + dlen > c->buflen)
9760 return -1; // doesn't fit
9761
9762 if (c->fullent) {
9763 memcpy(c->buf + c->pos, de, sizeof(*de));
9764 } else {
9765 memcpy(c->buf + c->pos, de->d_name, dlen);
9766 }
9767 c->pos += dlen;
9768 return 0;
9769 }
9770
9771 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9772 {
9773 getdents_result gr;
9774 gr.buf = buf;
9775 gr.buflen = buflen;
9776 gr.fullent = fullent;
9777 gr.pos = 0;
9778
9779 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9780
9781 if (r < 0) { // some error
9782 if (r == -1) { // buffer ran out of space
9783 if (gr.pos) { // but we got some entries already!
9784 return gr.pos;
9785 } // or we need a larger buffer
9786 return -CEPHFS_ERANGE;
9787 } else { // actual error, return it
9788 return r;
9789 }
9790 }
9791 return gr.pos;
9792 }
9793
9794
9795 /* getdir */
struct getdir_result {
  list<string> *contents; // caller-owned list that receives entry names
  int num;                // count of entries appended
};
9800
9801 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9802 {
9803 getdir_result *r = static_cast<getdir_result *>(p);
9804
9805 r->contents->push_back(de->d_name);
9806 r->num++;
9807 return 0;
9808 }
9809
9810 int Client::getdir(const char *relpath, list<string>& contents,
9811 const UserPerm& perms)
9812 {
9813 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
9814 tout(cct) << "getdir" << std::endl;
9815 tout(cct) << relpath << std::endl;
9816
9817 dir_result_t *d;
9818 int r = opendir(relpath, &d, perms);
9819 if (r < 0)
9820 return r;
9821
9822 getdir_result gr;
9823 gr.contents = &contents;
9824 gr.num = 0;
9825 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9826
9827 closedir(d);
9828
9829 if (r < 0)
9830 return r;
9831 return gr.num;
9832 }
9833
9834
9835 /****** file i/o **********/
9836
// Common parts for open() and openat(): resolve (and, with O_CREAT,
// possibly create) the target, open it, and allocate an integer fd.
// Call with client_lock locked. Returns the new fd (>= 0) or a negative
// CEPHFS_* error.
int Client::create_and_open(int dirfd, const char *relpath, int flags,
                            const UserPerm& perms, mode_t mode, int stripe_unit,
                            int stripe_count, int object_size, const char *data_pool,
                            std::string alternate_name) {
  ceph_assert(ceph_mutex_is_locked(client_lock));
  int cflags = ceph_flags_sys2wire(flags);
  tout(cct) << cflags << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));

  // resolve dirfd (CEPHFS_AT_FDCWD or a real fd) to its inode
  InodeRef dirinode = nullptr;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, followsym, mask, dirinode);
  // O_CREAT|O_EXCL demands that the target did NOT already exist
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -CEPHFS_ELOOP;

  // target missing and O_CREAT given: create it under its parent directory
  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
    if (r < 0) {
      goto out;
    }
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms,
                std::move(alternate_name));
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  // _create() may already have produced an open handle; otherwise open now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

  out:
  return r;
}
9924
// open() is openat() anchored at the client's current working directory.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool, std::string alternate_name)
{
  return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
                stripe_count, object_size, data_pool, alternate_name);
}
9932
9933 int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9934 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9935 const char *data_pool, std::string alternate_name) {
9936 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9937 if (!mref_reader.is_state_satisfied()) {
9938 return -CEPHFS_ENOTCONN;
9939 }
9940
9941 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9942 tout(cct) << dirfd << std::endl;
9943 tout(cct) << relpath << std::endl;
9944 tout(cct) << flags << std::endl;
9945 tout(cct) << mode << std::endl;
9946
9947 std::scoped_lock locker(client_lock);
9948 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9949 object_size, data_pool, alternate_name);
9950
9951 tout(cct) << r << std::endl;
9952 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
9953 return r;
9954 }
9955
9956 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9957 const UserPerm& perms)
9958 {
9959 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9960
9961 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9962 if (!mref_reader.is_state_satisfied())
9963 return -CEPHFS_ENOTCONN;
9964
9965 std::scoped_lock lock(client_lock);
9966 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9967 filepath path(ino);
9968 req->set_filepath(path);
9969
9970 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9971 char f[30];
9972 sprintf(f, "%u", h);
9973 filepath path2(dirino);
9974 path2.push_dentry(string(f));
9975 req->set_filepath2(path2);
9976
9977 int r = make_request(req, perms, NULL, NULL,
9978 rand() % mdsmap->get_num_in_mds());
9979 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9980 return r;
9981 }
9982
9983
9984 /**
9985 * Load inode into local cache.
9986 *
9987 * If inode pointer is non-NULL, and take a reference on
9988 * the resulting Inode object in one operation, so that caller
9989 * can safely assume inode will still be there after return.
9990 */
// Resolve a vinodeno (ino + snapid) to a cached Inode via a LOOKUPINO
// request. On success with a non-NULL `inode`, a referenced Inode* is
// returned (release with _ll_put/ll_put machinery).
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // reserved/synthetic vinos never exist on the MDS
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // the reply populated inode_map; hand back a referenced pointer
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
10024
10025 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
10026 {
10027 vinodeno_t vino(ino, CEPH_NOSNAP);
10028 std::scoped_lock lock(client_lock);
10029 return _lookup_vino(vino, perms, inode);
10030 }
10031
10032 /**
10033 * Find the parent inode of `ino` and insert it into
10034 * our cache. Conditionally also set `parent` to a referenced
10035 * Inode* if caller provides non-NULL value.
10036 */
10037 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
10038 {
10039 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
10040
10041 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
10042 filepath path(ino->ino);
10043 req->set_filepath(path);
10044
10045 InodeRef target;
10046 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
10047 // Give caller a reference to the parent ino if they provided a pointer.
10048 if (parent != NULL) {
10049 if (r == 0) {
10050 *parent = target.get();
10051 _ll_get(*parent);
10052 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
10053 } else {
10054 *parent = NULL;
10055 }
10056 }
10057 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
10058 return r;
10059 }
10060
10061 /**
10062 * Populate the parent dentry for `ino`, provided it is
10063 * a child of `parent`.
10064 */
10065 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
10066 {
10067 ceph_assert(parent->is_dir());
10068 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
10069
10070 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10071 if (!mref_reader.is_state_satisfied())
10072 return -CEPHFS_ENOTCONN;
10073
10074 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10075 req->set_filepath2(filepath(parent->ino));
10076 req->set_filepath(filepath(ino->ino));
10077 req->set_inode(ino);
10078
10079 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
10080 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
10081 return r;
10082 }
10083
// Public wrapper: take client_lock, then run the LOOKUPNAME request.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::scoped_lock lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
10089
// Build a new file handle (Fh) for an already-opened inode and configure
// its readahead window from the client_readahead_* options and the file
// layout.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // opens of snapshot inodes are tracked via snap_cap_refs instead of
    // regular open refs (see _release_fh)
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	    << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // cap readahead by the stricter of the byte and stripe-period limits
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead to the layout's period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
10121
// Tear down a file handle: drop the open ref (flushing dirty data and
// re-evaluating caps for head inodes), release file locks, and report any
// asynchronous write error recorded on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // this handle can no longer hold a delegation
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref for this mode: flush dirty data and update wanted caps
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inode opens are counted separately (see _create_fh)
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
10156
10157 void Client::_put_fh(Fh *f)
10158 {
10159 int left = f->put();
10160 if (!left) {
10161 delete f;
10162 }
10163 }
10164
// Open an inode: take an open ref for the cap mode, and either satisfy the
// open from already-issued caps or send a CEPH_MDS_OP_OPEN to the MDS.
// On success, *fhp (if non-NULL) receives a new Fh. Returns 0 or a
// negative CEPHFS_* error.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are read-only; reject any write-ish open flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs; no MDS round-trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // temporary Fh just to wait for the needed caps
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // open failed: undo the pending-open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
10245
// Re-assert our wanted caps on an inode. If we still hold caps (and either
// want no write caps or have an auth cap), a cap check suffices; otherwise
// replay an OPEN request with flags derived from the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map wanted RD/WR caps back onto open(2)-style flags for the OPEN op
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
10283
10284 int Client::_close(int fd)
10285 {
10286 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
10287 tout(cct) << "close" << std::endl;
10288 tout(cct) << fd << std::endl;
10289
10290 Fh *fh = get_filehandle(fd);
10291 if (!fh)
10292 return -CEPHFS_EBADF;
10293 int err = _release_fh(fh);
10294 fd_map.erase(fd);
10295 put_fd(fd);
10296 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
10297 return err;
10298 }
10299
10300 int Client::close(int fd) {
10301 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10302 if (!mref_reader.is_state_satisfied())
10303 return -CEPHFS_ENOTCONN;
10304
10305 std::scoped_lock lock(client_lock);
10306 return _close(fd);
10307 }
10308
10309 // ------------
10310 // read, write
10311
10312 loff_t Client::lseek(int fd, loff_t offset, int whence)
10313 {
10314 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10315 if (!mref_reader.is_state_satisfied())
10316 return -CEPHFS_ENOTCONN;
10317
10318 tout(cct) << "lseek" << std::endl;
10319 tout(cct) << fd << std::endl;
10320 tout(cct) << offset << std::endl;
10321 tout(cct) << whence << std::endl;
10322
10323 std::scoped_lock lock(client_lock);
10324 Fh *f = get_filehandle(fd);
10325 if (!f)
10326 return -CEPHFS_EBADF;
10327 #if defined(__linux__) && defined(O_PATH)
10328 if (f->flags & O_PATH)
10329 return -CEPHFS_EBADF;
10330 #endif
10331 return _lseek(f, offset, whence);
10332 }
10333
10334 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
10335 {
10336 Inode *in = f->inode.get();
10337 bool whence_check = false;
10338 loff_t pos = -1;
10339
10340 switch (whence) {
10341 case SEEK_END:
10342 whence_check = true;
10343 break;
10344
10345 #ifdef SEEK_DATA
10346 case SEEK_DATA:
10347 whence_check = true;
10348 break;
10349 #endif
10350
10351 #ifdef SEEK_HOLE
10352 case SEEK_HOLE:
10353 whence_check = true;
10354 break;
10355 #endif
10356 }
10357
10358 if (whence_check) {
10359 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10360 if (r < 0)
10361 return r;
10362 }
10363
10364 switch (whence) {
10365 case SEEK_SET:
10366 pos = offset;
10367 break;
10368
10369 case SEEK_CUR:
10370 pos = f->pos + offset;
10371 break;
10372
10373 case SEEK_END:
10374 pos = in->size + offset;
10375 break;
10376
10377 #ifdef SEEK_DATA
10378 case SEEK_DATA:
10379 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
10380 return -CEPHFS_ENXIO;
10381 pos = offset;
10382 break;
10383 #endif
10384
10385 #ifdef SEEK_HOLE
10386 case SEEK_HOLE:
10387 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
10388 return -CEPHFS_ENXIO;
10389 pos = in->size;
10390 break;
10391 #endif
10392
10393 default:
10394 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
10395 return -CEPHFS_EINVAL;
10396 }
10397
10398 if (pos < 0) {
10399 return -CEPHFS_EINVAL;
10400 } else {
10401 f->pos = pos;
10402 }
10403
10404 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
10405 return f->pos;
10406 }
10407
10408
// Acquire exclusive use of f->pos (the file offset) for this Fh.
// Caller must hold client_lock. If the pos lock is held, or older waiters
// are already queued, we append ourselves to the FIFO queue and sleep on a
// stack-local condvar until we are both unblocked and at the front of the
// queue. Pair every call with unlock_fh_pos().
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Adopt client_lock into a unique_lock so the condvar can release it
    // while we sleep; release() below hands ownership back to the caller
    // without unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      // Wake only when the lock is free AND we are the oldest waiter,
      // preserving FIFO fairness among concurrent callers.
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
10429
10430 void Client::unlock_fh_pos(Fh *f)
10431 {
10432 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10433
10434 ldout(cct, 10) << __func__ << " " << f << dendl;
10435 f->pos_locked = false;
10436 if (!f->pos_waiters.empty()) {
10437 // only wake up the oldest waiter
10438 auto cond = f->pos_waiters.front();
10439 cond->notify_one();
10440 }
10441 }
10442
// Migrate this inode's inline data out to its first RADOS object.
// If there is no inline data, onfinish completes immediately with 0.
// Otherwise two OSD ops are issued against object "<ino>.00000000":
//   1) an (unguarded, fire-and-forget) create so the object exists, and
//   2) a guarded write that only applies if the object's stored
//      "inline_version" xattr is older than ours.
// onfinish fires when the guarded write completes; the caller is
// responsible for clearing in->inline_data / inline_version on success.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);  // create-if-absent; do not fail if it exists

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // Guard: apply the write only if our inline_version is strictly newer
  // than the version already recorded on the object.
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
10487
10488 //
10489
10490 // blocking osd interface
10491
10492 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
10493 {
10494 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10495 if (!mref_reader.is_state_satisfied())
10496 return -CEPHFS_ENOTCONN;
10497
10498 tout(cct) << "read" << std::endl;
10499 tout(cct) << fd << std::endl;
10500 tout(cct) << size << std::endl;
10501 tout(cct) << offset << std::endl;
10502
10503 std::unique_lock lock(client_lock);
10504 Fh *f = get_filehandle(fd);
10505 if (!f)
10506 return -CEPHFS_EBADF;
10507 #if defined(__linux__) && defined(O_PATH)
10508 if (f->flags & O_PATH)
10509 return -CEPHFS_EBADF;
10510 #endif
10511 bufferlist bl;
10512 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10513 size = std::min(size, (loff_t)INT_MAX);
10514 int r = _read(f, offset, size, &bl);
10515 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
10516 if (r >= 0) {
10517 lock.unlock();
10518 bl.begin().copy(bl.length(), buf);
10519 r = bl.length();
10520 }
10521 return r;
10522 }
10523
10524 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10525 {
10526 if (iovcnt < 0)
10527 return -CEPHFS_EINVAL;
10528 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10529 }
10530
// Core read path. Handles implicit-offset reads (offset < 0 means "use and
// advance f->pos" under the fh pos lock), inline data, the buffered
// (ObjectCacher) path and the synchronous path, retrying once the size has
// been re-verified if a short read suggests the file grew. Caller must hold
// client_lock. Returns bytes read into *bl, or a negative CEPHFS error.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  // Fh must have been opened readable.
  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // Implicit offset: take the pos lock and read from the current pos;
    // it stays locked until "done" below so the pos update is atomic.
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // Inline state unknown; fetch it from the MDS before deciding a path.
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    // Takes a FILE_RD cap ref on success; released at "done".
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  // O_DIRECT bypasses the cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    // Serve the read directly from the inline data blob, zero-filling the
    // region between the inline data's end and EOF.
    uint32_t len = in->inline_data.length();
    uint64_t endoff = offset + size;
    if (endoff > in->size)
      endoff = in->size;

    if (offset < len) {
      if (endoff <= len) {
        bl->substr_of(in->inline_data, offset, endoff - offset);
      } else {
        bl->substr_of(in->inline_data, offset, len - offset);
        bl->append_zero(endoff - len);
      }
      rc = endoff - offset;
    } else if ((uint64_t)offset < endoff) {
      bl->append_zero(endoff - offset);
      rc = endoff - offset;
    } else {
      rc = 0;
    }
    goto success;
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // Buffered path via the ObjectCacher.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    // Sync path; O_DIRECT must first flush any dirty cached data.
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // Short read: the file may be longer than our cached size.
      offset += rc;
      size -= rc;

      // Drop the cap ref before re-fetching attrs; "have = 0" prevents a
      // double put at "done".
      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
        auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
        if (r < 0) {
          rc = r;
          goto done;
        }
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;

  ++nr_read_request;
  update_io_stat_read(lat);

done:
  // done!
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
10664
// Completion context for an asynchronous readahead. Pins the Fh and counts
// the pending readahead; both are undone in the destructor.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();                    // hold a ref so the Fh outlives the async read
  f->readahead.inc_pending();
}
10670
// Undo the bookkeeping taken in the constructor: one pending readahead and
// one Fh reference.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10675
// Readahead completion: drop the FILE_RD|FILE_CACHE cap refs that were taken
// when the readahead was initiated (see _read_async), and account the bytes
// read (r > 0) in the read-size metrics.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10683
// Buffered (ObjectCacher-backed) read path used by _read().
// Reads up to `len` bytes at `off` into *bl, first trimming the request to
// the currently-known file size. Drops client_lock while waiting for a
// cache miss to fill. Afterwards, may kick off an asynchronous readahead.
// Returns bytes read (possibly < len) or a negative error.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: wait for the fill, releasing client_lock; hold a
    // FILE_CACHE cap ref across the wait so the cap cannot be revoked.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh; the cap refs taken below are dropped in
      // C_Readahead::finish() when the readahead completes.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        // Data already cached; the context would never fire, so delete it
        // (its destructor undoes the Fh pin / pending count).
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
10741
// Synchronous (cache-bypassing) read path used by _read().
// Loops issuing Filer reads at the current position until `len` bytes are
// accumulated or a short read is seen. On a short read within the known
// file size, the gap up to EOF is zero-filled; if the short read lands at
// our cached EOF, *checkeof is set so the caller can re-verify the size and
// retry. Returns bytes placed in *bl or a negative error.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: runs with client_lock dropped; mutates the captured read/pos/left
  // counters and appends to *bl.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
        if (left == 0)
          return 0;
      }

      // Reached our cached EOF; the caller should re-check the real size.
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    // Drop client_lock while blocking on the OSD read.
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10814
10815 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10816 {
10817 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10818 if (!mref_reader.is_state_satisfied())
10819 return -CEPHFS_ENOTCONN;
10820
10821 tout(cct) << "write" << std::endl;
10822 tout(cct) << fd << std::endl;
10823 tout(cct) << size << std::endl;
10824 tout(cct) << offset << std::endl;
10825
10826 std::scoped_lock lock(client_lock);
10827 Fh *fh = get_filehandle(fd);
10828 if (!fh)
10829 return -CEPHFS_EBADF;
10830 #if defined(__linux__) && defined(O_PATH)
10831 if (fh->flags & O_PATH)
10832 return -CEPHFS_EBADF;
10833 #endif
10834 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10835 size = std::min(size, (loff_t)INT_MAX);
10836 int r = _write(fh, offset, size, buf, NULL, false);
10837 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10838 return r;
10839 }
10840
10841 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10842 {
10843 if (iovcnt < 0)
10844 return -CEPHFS_EINVAL;
10845 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10846 }
10847
// Common implementation behind preadv/pwritev. Sums the iovec lengths,
// optionally clamps the total to INT_MAX (for APIs that return a 32-bit
// count), and dispatches to _write() or _read(). On the read side the data
// is scattered back into the caller's iovecs with client_lock dropped
// during the copy. Caller must hold client_lock.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset,
                                       bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles cannot be used for I/O.
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Copy out without holding client_lock; bl is local so this is safe.
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
10898
10899 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10900 {
10901 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10902 if (!mref_reader.is_state_satisfied())
10903 return -CEPHFS_ENOTCONN;
10904
10905 tout(cct) << fd << std::endl;
10906 tout(cct) << offset << std::endl;
10907
10908 std::scoped_lock cl(client_lock);
10909 Fh *fh = get_filehandle(fd);
10910 if (!fh)
10911 return -CEPHFS_EBADF;
10912 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
10913 }
10914
// Core write path. Exactly one of `buf` or `iov` supplies the data.
// Handles implicit-offset/O_APPEND positioning, quota and max-filesize
// checks, inline data (updating it in place or kicking off uninlining),
// the buffered (ObjectCacher) path and the synchronous path, then updates
// size/mtime/caps metadata. Caller must hold client_lock. Returns bytes
// written or a negative CEPHFS error.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;
  Inode *in = f->inode.get();

  // Refuse writes that would grow the file past the configured maximum.
  if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
       (uint64_t)(offset+size) > in->size ) { //exceeds filesize
    return -CEPHFS_EFBIG;
  }
  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    // fpos != 0 later signals that f->pos must be advanced on success.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    // Inline state unknown; fetch it before choosing a write path.
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // Takes FILE_WR and AUTH_SHARED cap refs on success; AUTH_SHARED is
  // dropped immediately below, FILE_WR at the end of the function.
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  if (size > 0) {
    // Writing must clear setuid/setgid bits (POSIX semantics).
    r = clear_suid_sgid(in, f->actor_perms);
    if (r < 0) {
      put_cap_ref(in, CEPH_CAP_FILE_WR);
      return r;
    }
  }

  // O_DIRECT bypasses the buffer cache even if we hold buffer caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Result too big (or no buffer cap) to stay inline: start migrating
      // the inline data to RADOS; we wait for it at "done".
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Apply the write directly to the inline data blob.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    // Drop client_lock while blocking on the OSD write.
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  update_write_io_size(size);
  // time
  lat = ceph_clock_now();
  lat -= start;

  ++nr_write_request;
  update_io_stat_write(lat);

  if (fpos) {
    // Implicit-offset write: advance the fd position under the pos lock.
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait for the uninline operation started above to complete.
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means another writer already uninlined the data.
    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
11149
11150 int Client::_flush(Fh *f)
11151 {
11152 Inode *in = f->inode.get();
11153 int err = f->take_async_err();
11154 if (err != 0) {
11155 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
11156 << cpp_strerror(err) << dendl;
11157 } else {
11158 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
11159 }
11160
11161 return err;
11162 }
11163
11164 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
11165 {
11166 struct ceph_statx stx;
11167 stx.stx_size = length;
11168 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
11169 }
11170
11171 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
11172 {
11173 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11174 if (!mref_reader.is_state_satisfied())
11175 return -CEPHFS_ENOTCONN;
11176
11177 tout(cct) << __func__ << std::endl;
11178 tout(cct) << fd << std::endl;
11179 tout(cct) << length << std::endl;
11180
11181 std::scoped_lock lock(client_lock);
11182 Fh *f = get_filehandle(fd);
11183 if (!f)
11184 return -CEPHFS_EBADF;
11185 #if defined(__linux__) && defined(O_PATH)
11186 if (f->flags & O_PATH)
11187 return -CEPHFS_EBADF;
11188 #endif
11189 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
11190 return -CEPHFS_EBADF;
11191 struct stat attr;
11192 attr.st_size = length;
11193 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
11194 }
11195
11196 int Client::fsync(int fd, bool syncdataonly)
11197 {
11198 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11199 if (!mref_reader.is_state_satisfied())
11200 return -CEPHFS_ENOTCONN;
11201
11202 tout(cct) << "fsync" << std::endl;
11203 tout(cct) << fd << std::endl;
11204 tout(cct) << syncdataonly << std::endl;
11205
11206 std::scoped_lock lock(client_lock);
11207 Fh *f = get_filehandle(fd);
11208 if (!f)
11209 return -CEPHFS_EBADF;
11210 #if defined(__linux__) && defined(O_PATH)
11211 if (f->flags & O_PATH)
11212 return -CEPHFS_EBADF;
11213 #endif
11214 int r = _fsync(f, syncdataonly);
11215 if (r == 0) {
11216 // The IOs in this fsync were okay, but maybe something happened
11217 // in the background that we shoudl be reporting?
11218 r = f->take_async_err();
11219 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
11220 << ") = 0, async_err = " << r << dendl;
11221 } else {
11222 // Assume that an error we encountered during fsync, even reported
11223 // synchronously, would also have applied the error to the Fh, and we
11224 // should clear it here to avoid returning the same error again on next
11225 // call.
11226 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
11227 << r << dendl;
11228 f->take_async_err();
11229 }
11230 return r;
11231 }
11232
// Flush this inode's dirty data (and, unless syncdataonly, its dirty
// metadata/caps and unsafe MDS requests) and wait for everything to be
// stable. Caller must hold client_lock; it is dropped while waiting for
// the ObjectCacher flush. Returns 0 or a negative error from the flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    // Kick off an async flush of all cached dirty data for this inode.
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Force dirty caps to the MDS and remember the flush tid to wait on.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // Push the MDS log and wait for our last unsafe request to become safe
    // (which implies all earlier ones are safe too).
    flush_mdlog_sync(in);

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    // client_lock is dropped while blocking on the flush completion.
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
11301
11302 int Client::_fsync(Fh *f, bool syncdataonly)
11303 {
11304 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
11305 return _fsync(f->inode.get(), syncdataonly);
11306 }
11307
11308 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
11309 {
11310 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11311 if (!mref_reader.is_state_satisfied())
11312 return -CEPHFS_ENOTCONN;
11313
11314 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
11315 tout(cct) << fd << std::endl;
11316
11317 std::scoped_lock lock(client_lock);
11318 Fh *f = get_filehandle(fd);
11319 if (!f)
11320 return -CEPHFS_EBADF;
11321 int r = _getattr(f->inode, mask, perms);
11322 if (r < 0)
11323 return r;
11324 fill_stat(f->inode, stbuf, NULL);
11325 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
11326 return r;
11327 }
11328
11329 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
11330 unsigned int want, unsigned int flags)
11331 {
11332 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11333 if (!mref_reader.is_state_satisfied())
11334 return -CEPHFS_ENOTCONN;
11335
11336 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
11337 tout(cct) << fd << std::endl;
11338
11339 std::scoped_lock lock(client_lock);
11340 Fh *f = get_filehandle(fd);
11341 if (!f)
11342 return -CEPHFS_EBADF;
11343
11344 unsigned mask = statx_to_mask(flags, want);
11345
11346 int r = 0;
11347 if (mask) {
11348 r = _getattr(f->inode, mask, perms);
11349 if (r < 0) {
11350 ldout(cct, 3) << "fstatx exit on error!" << dendl;
11351 return r;
11352 }
11353 }
11354
11355 fill_statx(f->inode, mask, stx);
11356 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
11357 return r;
11358 }
11359
11360 int Client::statxat(int dirfd, const char *relpath,
11361 struct ceph_statx *stx, const UserPerm& perms,
11362 unsigned int want, unsigned int flags) {
11363 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11364 if (!mref_reader.is_state_satisfied()) {
11365 return -CEPHFS_ENOTCONN;
11366 }
11367
11368 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
11369 tout(cct) << dirfd << std::endl;
11370 tout(cct) << relpath << std::endl;
11371
11372 unsigned mask = statx_to_mask(flags, want);
11373
11374 InodeRef dirinode;
11375 std::scoped_lock lock(client_lock);
11376 int r = get_fd_inode(dirfd, &dirinode);
11377 if (r < 0) {
11378 return r;
11379 }
11380
11381 InodeRef in;
11382 filepath path(relpath);
11383 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
11384 if (r < 0) {
11385 return r;
11386 }
11387 r = _getattr(in, mask, perms);
11388 if (r < 0) {
11389 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
11390 return r;
11391 }
11392
11393 fill_statx(in, mask, stx);
11394 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
11395 return r;
11396 }
11397
11398 // not written yet, but i want to link!
11399
11400 int Client::chdir(const char *relpath, std::string &new_cwd,
11401 const UserPerm& perms)
11402 {
11403 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11404 if (!mref_reader.is_state_satisfied())
11405 return -CEPHFS_ENOTCONN;
11406
11407 tout(cct) << "chdir" << std::endl;
11408 tout(cct) << relpath << std::endl;
11409
11410 filepath path(relpath);
11411 InodeRef in;
11412
11413 std::scoped_lock lock(client_lock);
11414 int r = path_walk(path, &in, perms);
11415 if (r < 0)
11416 return r;
11417
11418 if (!(in.get()->is_dir()))
11419 return -CEPHFS_ENOTDIR;
11420
11421 if (cwd != in)
11422 cwd.swap(in);
11423 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
11424
11425 _getcwd(new_cwd, perms);
11426 return 0;
11427 }
11428
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root.  If a parent link is not
// cached, issue a LOOKUPNAME to the MDS and restart the walk from scratch.
// On an unlinked cwd/ancestor, 'dir' is left untouched.
// Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
11468
11469 void Client::getcwd(string& dir, const UserPerm& perms)
11470 {
11471 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11472 if (!mref_reader.is_state_satisfied())
11473 return;
11474
11475 std::scoped_lock l(client_lock);
11476
11477 _getcwd(dir, perms);
11478 }
11479
// statvfs-style filesystem statistics.  NOTE: 'path' is ignored — the
// result always describes the whole fs (or the governing quota root).
// client_lock is deliberately dropped while waiting for the objecter's
// statfs reply, then reacquired.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool we can report that pool's usage; otherwise fall
  // back to cluster-wide stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  // Drop the client lock while blocking on the OSD round-trip.
  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enabled(QUOTA_MAX_BYTES) ? root : get_quota_root(root.get(), perms, QUOTA_MAX_BYTES);

  // get_quota_root should always give us something if client quotas are
  // enabled
  ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);

  /* If bytes quota is set on a directory and conf option "client quota df"
   * is also set, available space = quota limit - used space. Else,
   * available space = total space - used space. */
  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
11586
// Core file-locking helper shared by fcntl (POSIX) and flock(2) paths.
// Translates the struct flock into a CEPH_LOCK_* request, sends it to the
// MDS, and on success mirrors the result into local lock state.
//  lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//  op:        CEPH_MDS_OP_{GET,SET}FILELOCK
//  sleep:     non-zero to let the MDS block until the lock is available
//  removing:  true when called from _release_filelocks (skip Fh-side state)
// Returns 0 on success or a negative CEPHFS_* error.
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Locking on this inode previously hit an unrecoverable error.
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only a blocking SETFILELOCK that actually acquires may wait at the MDS.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: take an extra ref so the interrupt path can still
    // reference the request while we wait.
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra ref taken above
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or unlocked) lock returned by the MDS back
      // into the caller's struct flock.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the granted change into the inode-wide lock state, creating
      // it lazily on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // Also track per-Fh state so _release_filelocks can clean up on close
      // (skipped when we are the cleanup path itself).
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11700
// Interrupt a blocking SETFILELOCK request (e.g. on signal delivery from
// the FUSE interrupt callback).  Marks the original request aborted and, if
// it already reached an MDS, sends a companion *_INTR unlock so the MDS
// stops waiting on our behalf.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // The interrupt rule mirrors the original rule (FLOCK vs FCNTL).
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // Clone the original lock arguments but turn the op into an unlock under
  // the *_INTR rule.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11733
11734 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11735 {
11736 if (!in->fcntl_locks && !in->flock_locks)
11737 return;
11738
11739 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11740 encode(nr_fcntl_locks, bl);
11741 if (nr_fcntl_locks) {
11742 auto &lock_state = in->fcntl_locks;
11743 for(auto p = lock_state->held_locks.begin();
11744 p != lock_state->held_locks.end();
11745 ++p)
11746 encode(p->second, bl);
11747 }
11748
11749 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11750 encode(nr_flock_locks, bl);
11751 if (nr_flock_locks) {
11752 auto &lock_state = in->flock_locks;
11753 for(auto p = lock_state->held_locks.begin();
11754 p != lock_state->held_locks.end();
11755 ++p)
11756 encode(p->second, bl);
11757 }
11758
11759 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
11760 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11761 }
11762
// Release all file locks held through this Fh (called on close).  If the
// inode is in lock-error state the locks are only dropped locally;
// otherwise an explicit unlock is sent to the MDS for each one.
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  // (lock-type, lock) pairs to unlock at the MDS after local cleanup.
  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // Post-increment before potential removal: remove_lock may invalidate
    // the current iterator.
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Clear the error flag once no locks remain anywhere on the inode.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  // removing=true: skip re-recording state on the Fh we are tearing down.
  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
11821
11822 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11823 ceph_lock_state_t *lock_state)
11824 {
11825 int lock_cmd;
11826 if (F_RDLCK == fl->l_type)
11827 lock_cmd = CEPH_LOCK_SHARED;
11828 else if (F_WRLCK == fl->l_type)
11829 lock_cmd = CEPH_LOCK_EXCL;
11830 else
11831 lock_cmd = CEPH_LOCK_UNLOCK;;
11832
11833 ceph_filelock filelock;
11834 filelock.start = fl->l_start;
11835 filelock.length = fl->l_len;
11836 filelock.client = 0;
11837 // see comment in _do_filelock()
11838 filelock.owner = owner | (1ULL << 63);
11839 filelock.pid = fl->l_pid;
11840 filelock.type = lock_cmd;
11841
11842 if (filelock.type == CEPH_LOCK_UNLOCK) {
11843 list<ceph_filelock> activated_locks;
11844 lock_state->remove_lock(filelock, activated_locks);
11845 } else {
11846 bool r = lock_state->add_lock(filelock, false, false, NULL);
11847 ceph_assert(r);
11848 }
11849 }
11850
11851 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11852 {
11853 Inode *in = fh->inode.get();
11854 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11855 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11856 return ret;
11857 }
11858
11859 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11860 {
11861 Inode *in = fh->inode.get();
11862 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11863 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11864 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11865 return ret;
11866 }
11867
11868 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11869 {
11870 Inode *in = fh->inode.get();
11871 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11872
11873 int sleep = !(cmd & LOCK_NB);
11874 cmd &= ~LOCK_NB;
11875
11876 int type;
11877 switch (cmd) {
11878 case LOCK_SH:
11879 type = F_RDLCK;
11880 break;
11881 case LOCK_EX:
11882 type = F_WRLCK;
11883 break;
11884 case LOCK_UN:
11885 type = F_UNLCK;
11886 break;
11887 default:
11888 return -CEPHFS_EINVAL;
11889 }
11890
11891 struct flock fl;
11892 memset(&fl, 0, sizeof(fl));
11893 fl.l_type = type;
11894 fl.l_whence = SEEK_SET;
11895
11896 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11897 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11898 return ret;
11899 }
11900
11901 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11902 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11903 if (!mref_reader.is_state_satisfied()) {
11904 return -CEPHFS_ENOTCONN;
11905 }
11906
11907 std::scoped_lock lock(client_lock);
11908 InodeRef in;
11909 int r = Client::path_walk(path, &in, perms, true);
11910 if (r < 0) {
11911 return r;
11912 }
11913
11914 if (in->snapid == CEPH_NOSNAP) {
11915 return -CEPHFS_EINVAL;
11916 }
11917
11918 snap_info->id = in->snapid;
11919 snap_info->metadata = in->snap_metadata;
11920 return 0;
11921 }
11922
11923 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11924 {
11925 /* Since the only thing this does is wrap a call to statfs, and
11926 statfs takes a lock, it doesn't seem we have a need to split it
11927 out. */
11928 return statfs(0, stbuf, perms);
11929 }
11930
// Install the caller-supplied callbacks and start the finisher thread that
// services each one.  Callbacks left null in 'args' are simply not enabled.
// Must only run before the client is mounted (enforced by the public
// wrappers below).
void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
{
  if (!args)
    return;

  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  // umask_cb is invoked synchronously, so it needs no finisher.
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
11966
// This is deprecated, use ll_register_callbacks2() instead.
// Unlike the v2 entry point it asserts (rather than returning an error)
// when called on a client that is already mounting/mounted/unmounting.
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
{
  ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());

  _ll_register_callbacks(args);
}
11974
// Preferred callback-registration entry point: returns -CEPHFS_EBUSY
// instead of asserting when the client is already in use.
int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
{
  if (is_mounting() || is_mounted() || is_unmounting())
    return -CEPHFS_EBUSY;

  _ll_register_callbacks(args);
  return 0;
}
11983
// Decide (and verify) how cached dentries will be invalidated: via the
// registered dentry-invalidate callback, or by remounting.  Returns
// {error, flag} — the pair comes from _do_remount() on the remount path;
// on the callback path it stays {0, false}.
std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
{
  std::pair <int, bool> r(0, false);

  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return std::make_pair(-CEPHFS_ENOTCONN, false);

  can_invalidate_dentries = can_invalidate;

  /*
   * Force to use the old and slow method to invalidate the dcache
   * if the euid is non-root, or the remount may fail with return
   * code 1 or 32.
   */
  uid_t euid = geteuid();
  ldout(cct, 10) << "euid: " << euid << dendl;
  if (euid != 0) {
    can_invalidate_dentries = true;
  }

  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else {
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }

  return r;
}
12016
// Flush all dirty file data and caps to the cluster.  Caller must hold
// client_lock; the lock is temporarily dropped at the end while waiting
// for the object cacher flush to complete.
int Client::_sync_fs()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  // Snapshot the tid now: only flushes issued up to this point must land.
  ceph_tid_t flush_tid = last_flush_tid;

  // flush the mdlog before waiting for unsafe requests.
  flush_mdlog_sync();

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    // Drop the client lock while blocking on the data flush.
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
12052
12053 int Client::sync_fs()
12054 {
12055 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12056 if (!mref_reader.is_state_satisfied())
12057 return -CEPHFS_ENOTCONN;
12058
12059 std::scoped_lock l(client_lock);
12060
12061 return _sync_fs();
12062 }
12063
// Ask the object cacher to release everything it can.
// NOTE(review): the return value is passed straight through from
// release_all(); presumably the amount that could not (or could) be
// released — confirm against ObjectCacher::release_all().
int64_t Client::drop_caches()
{
  std::scoped_lock l(client_lock);
  return objectcacher->release_all();
}
12069
12070 int Client::_lazyio(Fh *fh, int enable)
12071 {
12072 Inode *in = fh->inode.get();
12073 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
12074
12075 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
12076 return 0;
12077
12078 int orig_mode = fh->mode;
12079 if (enable) {
12080 fh->mode |= CEPH_FILE_MODE_LAZY;
12081 in->get_open_ref(fh->mode);
12082 in->put_open_ref(orig_mode);
12083 check_caps(in, CHECK_CAPS_NODELAY);
12084 } else {
12085 fh->mode &= ~CEPH_FILE_MODE_LAZY;
12086 in->get_open_ref(fh->mode);
12087 in->put_open_ref(orig_mode);
12088 check_caps(in, 0);
12089 }
12090
12091 return 0;
12092 }
12093
12094 int Client::lazyio(int fd, int enable)
12095 {
12096 std::scoped_lock l(client_lock);
12097 Fh *f = get_filehandle(fd);
12098 if (!f)
12099 return -CEPHFS_EBADF;
12100
12101 return _lazyio(f, enable);
12102 }
12103
// Low-level (Fh-based) wrapper around _lazyio().
int Client::ll_lazyio(Fh *fh, int enable)
{
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;

  std::scoped_lock lock(client_lock);
  return _lazyio(fh, enable);
}
12112
// Propagate locally buffered lazy-IO writes to the cluster.
// NOTE(review): offset/count are currently unused — the "for now"
// implementation flushes the whole file via _fsync and deliberately
// ignores its result; always returns 0 (or EBADF).
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
          << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // for now
  _fsync(f, true);

  return 0;
}
12128
// Synchronize this client's view of a lazy-IO file: flush local dirty data,
// then drop cached data/caps and re-fetch the size so subsequent reads see
// other clients' writes.  offset/count are currently unused (whole file).
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
          << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  // Only re-stat if _release actually dropped cached state.
  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }
  return 0;
}
12148
12149
12150 // =============================
12151 // snaps
12152
// Create a snapshot named 'name' of the directory at 'relpath' by making a
// directory inside its virtual ".snap" dir.  'metadata' is stored with the
// snapshot.  Permission to create is checked on the snapshotted directory
// when client_permissions is enabled.
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
		   mode_t mode, const std::map<std::string, std::string> &metadata)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
}
12175
// Remove snapshot 'name' of the directory at 'relpath' (rmdir inside its
// ".snap" dir).  check_perms=false skips the per-name deletion check while
// still checking the snapdir itself when client_permissions is enabled.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  Inode *snapdir = open_snapdir(in.get());
  if (cct->_conf->client_permissions) {
    r = may_delete(snapdir, check_perms ? name : NULL, perms);
    if (r < 0)
      return r;
  }
  return _rmdir(snapdir, name, perms);
}
12197
12198 // =============================
12199 // expose caps
12200
12201 int Client::get_caps_issued(int fd)
12202 {
12203 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12204 if (!mref_reader.is_state_satisfied())
12205 return -CEPHFS_ENOTCONN;
12206
12207 std::scoped_lock lock(client_lock);
12208
12209 Fh *f = get_filehandle(fd);
12210 if (!f)
12211 return -CEPHFS_EBADF;
12212
12213 return f->inode->caps_issued();
12214 }
12215
12216 int Client::get_caps_issued(const char *path, const UserPerm& perms)
12217 {
12218 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12219 if (!mref_reader.is_state_satisfied())
12220 return -CEPHFS_ENOTCONN;
12221
12222 std::scoped_lock lock(client_lock);
12223
12224 filepath p(path);
12225 InodeRef in;
12226 int r = path_walk(p, &in, perms, true);
12227 if (r < 0)
12228 return r;
12229 return in->caps_issued();
12230 }
12231
12232 // =========================================
12233 // low level
12234
// Populate/refresh the attributes of a virtual ".snap" directory inode
// 'in' from its backing directory 'diri'.  Times/change_attr come from the
// directory's snaprealm so they move when snapshots are created/deleted.
void Client::refresh_snapdir_attrs(Inode *in, Inode *diri) {
  ldout(cct, 10) << __func__ << ": snapdir inode=" << *in
                 << ", inode=" << *diri << dendl;
  in->ino = diri->ino;
  in->snapid = CEPH_SNAPDIR;
  in->mode = diri->mode;
  in->uid = diri->uid;
  in->gid = diri->gid;
  in->nlink = 1;
  in->mtime = diri->snaprealm->last_modified;
  in->ctime = in->mtime;
  in->change_attr = diri->snaprealm->change_attr;
  in->btime = diri->btime;
  in->atime = diri->atime;
  in->size = diri->size;

  in->dirfragtree.clear();
  in->snapdir_parent = diri;
  // copy posix acls to snapshotted inode
  in->xattrs.clear();
  // Only "system." xattrs (ACLs etc.) are mirrored; user xattrs are not.
  for (auto &[xattr_key, xattr_value] : diri->xattrs) {
    if (xattr_key.rfind("system.", 0) == 0) {
      in->xattrs[xattr_key] = xattr_value;
    }
  }
}
12261
12262 Inode *Client::open_snapdir(Inode *diri)
12263 {
12264 Inode *in;
12265 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
12266 if (!inode_map.count(vino)) {
12267 in = new Inode(this, vino, &diri->layout);
12268 refresh_snapdir_attrs(in, diri);
12269 diri->flags |= I_SNAPDIR_OPEN;
12270 inode_map[vino] = in;
12271 if (use_faked_inos())
12272 _assign_faked_ino(in);
12273 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
12274 } else {
12275 in = inode_map[vino];
12276 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
12277 }
12278 return in;
12279 }
12280
// Low-level lookup of 'name' under 'parent'.  On success fills *attr,
// takes an ll reference on the result (caller must ll_forget/ll_put), and
// stores it in *out.  On failure *out is set from the empty InodeRef
// (i.e. NULL) and attr->st_ino is zeroed.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  if (!fuse_default_permissions) {
    // "." and ".." are always permitted; otherwise require search perm on
    // the parent.
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller; released via ll_forget

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
12324
// Look up an inode by (ino, snapid).  Checks the local inode_map first,
// then asks the MDS.  CEPH_SNAPDIR is handled specially: the head inode is
// looked up and its virtual snapdir returned.  On success an ll reference
// is taken on *inode.
int Client::ll_lookup_vino(
    vinodeno_t vino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Reserved vinos (e.g. fuse internal ranges) can never resolve.
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  std::scoped_lock lock(client_lock);
  ldout(cct, 3) << __func__ << " " << vino << dendl;

  // Check the cache first
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p != inode_map.end()) {
    *inode = p->second;
    _ll_get(*inode);
    return 0;
  }

  uint64_t snapid = vino.snapid;

  // for snapdir, find the non-snapped dir inode
  if (snapid == CEPH_SNAPDIR)
    vino.snapid = CEPH_NOSNAP;

  int r = _lookup_vino(vino, perms, inode);
  if (r)
    return r;
  ceph_assert(*inode != NULL);

  if (snapid == CEPH_SNAPDIR) {
    Inode *tmp = *inode;

    // open the snapdir and put the inode ref
    *inode = open_snapdir(tmp);
    _ll_forget(tmp, 1);  // drop the ref _lookup_vino took on the head inode
    _ll_get(*inode);
  }
  return 0;
}
12370
12371 int Client::ll_lookup_inode(
12372 struct inodeno_t ino,
12373 const UserPerm& perms,
12374 Inode **inode)
12375 {
12376 vinodeno_t vino(ino, CEPH_NOSNAP);
12377 return ll_lookup_vino(vino, perms, inode);
12378 }
12379
// statx-flavoured ll_lookup: like ll_lookup but fills a ceph_statx using
// the mask derived from want/flags, and (unlike ll_lookup) also checks
// may_lookup for "." / "..".  On success an ll ref is taken on *out.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
12422
// Resolve a full path to an inode, filling `stx` and returning a new ll
// reference via *out on success.  AT_SYMLINK_NOFOLLOW in `flags` stops a
// trailing symlink from being followed.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << __func__ << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get()); // caller receives an ll reference
    *out = in.get();
    return 0;
  }
}
12455
// Take one "ll" (low-level/FUSE) reference on an inode.  The first ll
// reference also pins the inode in the cache (iget), pins the parent
// dentry for directories, and bumps the per-snapshot refcount used by
// ll_get_snap_ref().  Caller must hold client_lock.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->iget();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
12470
// Drop `num` ll references from an inode.  Returns the remaining ll_ref
// count; when it reaches zero the dentry pin, per-snapshot refcount and
// cache reference taken by _ll_get() are all released.  Caller must hold
// client_lock.
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    // releases the reference taken by in->iget() in _ll_get()
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
12493
// Drop every outstanding ll reference (e.g. on unmount).  `next` is saved
// before each _ll_put() because dropping the last reference can remove the
// inode from inode_map and invalidate the current iterator.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      // hold an InodeRef so the Inode outlives the _ll_put() call
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
12511
// Drop `count` ll references from an inode (FUSE "forget").  Returns true
// when this brought ll_ref to zero (or when the forget is ignored because
// we are unmounted or the target is the root).  Caller must hold
// client_lock.
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // More forgets than references held: warn and clamp to what we have.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
12541
12542 bool Client::ll_forget(Inode *in, uint64_t count)
12543 {
12544 std::scoped_lock lock(client_lock);
12545 return _ll_forget(in, count);
12546 }
12547
12548 bool Client::ll_put(Inode *in)
12549 {
12550 /* ll_forget already takes the lock */
12551 return ll_forget(in, 1);
12552 }
12553
12554 int Client::ll_get_snap_ref(snapid_t snap)
12555 {
12556 std::scoped_lock lock(client_lock);
12557 auto p = ll_snap_ref.find(snap);
12558 if (p != ll_snap_ref.end())
12559 return p->second;
12560 return 0;
12561 }
12562
12563 snapid_t Client::ll_get_snapid(Inode *in)
12564 {
12565 std::scoped_lock lock(client_lock);
12566 return in->snapid;
12567 }
12568
12569 Inode *Client::ll_get_inode(ino_t ino)
12570 {
12571 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12572 if (!mref_reader.is_state_satisfied())
12573 return NULL;
12574
12575 std::scoped_lock lock(client_lock);
12576
12577 vinodeno_t vino = _map_faked_ino(ino);
12578 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12579 if (p == inode_map.end())
12580 return NULL;
12581 Inode *in = p->second;
12582 _ll_get(in);
12583 return in;
12584 }
12585
12586 Inode *Client::ll_get_inode(vinodeno_t vino)
12587 {
12588 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12589 if (!mref_reader.is_state_satisfied())
12590 return NULL;
12591
12592 if (is_reserved_vino(vino))
12593 return NULL;
12594
12595 std::scoped_lock lock(client_lock);
12596
12597 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12598 if (p == inode_map.end())
12599 return NULL;
12600 Inode *in = p->second;
12601 _ll_get(in);
12602 return in;
12603 }
12604
12605 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12606 {
12607 vinodeno_t vino = _get_vino(in);
12608
12609 ldout(cct, 8) << __func__ << " " << vino << dendl;
12610 tout(cct) << __func__ << std::endl;
12611 tout(cct) << vino.ino.val << std::endl;
12612
12613 if (vino.snapid < CEPH_NOSNAP)
12614 return 0;
12615 else
12616 return _getattr(in, caps, perms);
12617 }
12618
12619 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12620 {
12621 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12622 if (!mref_reader.is_state_satisfied())
12623 return -CEPHFS_ENOTCONN;
12624
12625 std::scoped_lock lock(client_lock);
12626
12627 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12628
12629 if (res == 0)
12630 fill_stat(in, attr);
12631 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12632 return res;
12633 }
12634
12635 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12636 unsigned int flags, const UserPerm& perms)
12637 {
12638 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12639 if (!mref_reader.is_state_satisfied())
12640 return -CEPHFS_ENOTCONN;
12641
12642 std::scoped_lock lock(client_lock);
12643
12644 int res = 0;
12645 unsigned mask = statx_to_mask(flags, want);
12646
12647 if (mask && !in->caps_issued_mask(mask, true))
12648 res = _ll_getattr(in, mask, perms);
12649
12650 if (res == 0)
12651 fill_statx(in, mask, stx);
12652 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12653 return res;
12654 }
12655
// Permission-checked setattr front-end for the low-level API.  Runs
// may_setattr() unless FUSE handles permissions, strips the
// MTIME_NOW/ATIME_NOW flags (low-level callers supply explicit
// timestamps), and forwards to __setattrx().
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW flags are not honored on this path; drop them before __setattrx.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
12684
12685 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12686 const UserPerm& perms)
12687 {
12688 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12689 if (!mref_reader.is_state_satisfied())
12690 return -CEPHFS_ENOTCONN;
12691
12692 std::scoped_lock lock(client_lock);
12693
12694 InodeRef target(in);
12695 int res = _ll_setattrx(in, stx, mask, perms, &target);
12696 if (res == 0) {
12697 ceph_assert(in == target.get());
12698 fill_statx(in, in->caps_issued(), stx);
12699 }
12700
12701 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12702 return res;
12703 }
12704
12705 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12706 const UserPerm& perms)
12707 {
12708 struct ceph_statx stx;
12709 stat_to_statx(attr, &stx);
12710
12711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12712 if (!mref_reader.is_state_satisfied())
12713 return -CEPHFS_ENOTCONN;
12714
12715 std::scoped_lock lock(client_lock);
12716
12717 InodeRef target(in);
12718 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12719 if (res == 0) {
12720 ceph_assert(in == target.get());
12721 fill_stat(in, attr);
12722 }
12723
12724 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
12725 return res;
12726 }
12727
12728
12729 // ----------
12730 // xattrs
12731
12732 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12733 const UserPerm& perms)
12734 {
12735 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12736 if (!mref_reader.is_state_satisfied())
12737 return -CEPHFS_ENOTCONN;
12738
12739 std::scoped_lock lock(client_lock);
12740
12741 InodeRef in;
12742 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12743 if (r < 0)
12744 return r;
12745 return _getxattr(in, name, value, size, perms);
12746 }
12747
12748 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12749 const UserPerm& perms)
12750 {
12751 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12752 if (!mref_reader.is_state_satisfied())
12753 return -CEPHFS_ENOTCONN;
12754
12755 std::scoped_lock lock(client_lock);
12756
12757 InodeRef in;
12758 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12759 if (r < 0)
12760 return r;
12761 return _getxattr(in, name, value, size, perms);
12762 }
12763
12764 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12765 const UserPerm& perms)
12766 {
12767 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12768 if (!mref_reader.is_state_satisfied())
12769 return -CEPHFS_ENOTCONN;
12770
12771 std::scoped_lock lock(client_lock);
12772
12773 Fh *f = get_filehandle(fd);
12774 if (!f)
12775 return -CEPHFS_EBADF;
12776 return _getxattr(f->inode, name, value, size, perms);
12777 }
12778
12779 int Client::listxattr(const char *path, char *list, size_t size,
12780 const UserPerm& perms)
12781 {
12782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12783 if (!mref_reader.is_state_satisfied())
12784 return -CEPHFS_ENOTCONN;
12785
12786 std::scoped_lock lock(client_lock);
12787
12788 InodeRef in;
12789 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12790 if (r < 0)
12791 return r;
12792 return Client::_listxattr(in.get(), list, size, perms);
12793 }
12794
12795 int Client::llistxattr(const char *path, char *list, size_t size,
12796 const UserPerm& perms)
12797 {
12798 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12799 if (!mref_reader.is_state_satisfied())
12800 return -CEPHFS_ENOTCONN;
12801
12802 std::scoped_lock lock(client_lock);
12803
12804 InodeRef in;
12805 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12806 if (r < 0)
12807 return r;
12808 return Client::_listxattr(in.get(), list, size, perms);
12809 }
12810
12811 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12812 {
12813 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12814 if (!mref_reader.is_state_satisfied())
12815 return -CEPHFS_ENOTCONN;
12816
12817 std::scoped_lock lock(client_lock);
12818
12819 Fh *f = get_filehandle(fd);
12820 if (!f)
12821 return -CEPHFS_EBADF;
12822 return Client::_listxattr(f->inode.get(), list, size, perms);
12823 }
12824
12825 int Client::removexattr(const char *path, const char *name,
12826 const UserPerm& perms)
12827 {
12828 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12829 if (!mref_reader.is_state_satisfied())
12830 return -CEPHFS_ENOTCONN;
12831
12832 std::scoped_lock lock(client_lock);
12833
12834 InodeRef in;
12835 int r = Client::path_walk(path, &in, perms, true);
12836 if (r < 0)
12837 return r;
12838 return _removexattr(in, name, perms);
12839 }
12840
12841 int Client::lremovexattr(const char *path, const char *name,
12842 const UserPerm& perms)
12843 {
12844 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12845 if (!mref_reader.is_state_satisfied())
12846 return -CEPHFS_ENOTCONN;
12847
12848 std::scoped_lock lock(client_lock);
12849
12850 InodeRef in;
12851 int r = Client::path_walk(path, &in, perms, false);
12852 if (r < 0)
12853 return r;
12854 return _removexattr(in, name, perms);
12855 }
12856
12857 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12858 {
12859 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12860 if (!mref_reader.is_state_satisfied())
12861 return -CEPHFS_ENOTCONN;
12862
12863 std::scoped_lock lock(client_lock);
12864
12865 Fh *f = get_filehandle(fd);
12866 if (!f)
12867 return -CEPHFS_EBADF;
12868 return _removexattr(f->inode, name, perms);
12869 }
12870
12871 int Client::setxattr(const char *path, const char *name, const void *value,
12872 size_t size, int flags, const UserPerm& perms)
12873 {
12874 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12875 if (!mref_reader.is_state_satisfied())
12876 return -CEPHFS_ENOTCONN;
12877
12878 _setxattr_maybe_wait_for_osdmap(name, value, size);
12879
12880 std::scoped_lock lock(client_lock);
12881
12882 InodeRef in;
12883 int r = Client::path_walk(path, &in, perms, true);
12884 if (r < 0)
12885 return r;
12886 return _setxattr(in, name, value, size, flags, perms);
12887 }
12888
12889 int Client::lsetxattr(const char *path, const char *name, const void *value,
12890 size_t size, int flags, const UserPerm& perms)
12891 {
12892 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12893 if (!mref_reader.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN;
12895
12896 _setxattr_maybe_wait_for_osdmap(name, value, size);
12897
12898 std::scoped_lock lock(client_lock);
12899
12900 InodeRef in;
12901 int r = Client::path_walk(path, &in, perms, false);
12902 if (r < 0)
12903 return r;
12904 return _setxattr(in, name, value, size, flags, perms);
12905 }
12906
12907 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12908 int flags, const UserPerm& perms)
12909 {
12910 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12911 if (!mref_reader.is_state_satisfied())
12912 return -CEPHFS_ENOTCONN;
12913
12914 _setxattr_maybe_wait_for_osdmap(name, value, size);
12915
12916 std::scoped_lock lock(client_lock);
12917
12918 Fh *f = get_filehandle(fd);
12919 if (!f)
12920 return -CEPHFS_EBADF;
12921 return _setxattr(f->inode, name, value, size, flags, perms);
12922 }
12923
12924 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12925 const UserPerm& perms)
12926 {
12927 int r;
12928 const VXattr *vxattr = nullptr;
12929
12930 vxattr = _match_vxattr(in, name);
12931 if (vxattr) {
12932 r = -CEPHFS_ENODATA;
12933
12934 // Do a force getattr to get the latest quota before returning
12935 // a value to userspace.
12936 int flags = 0;
12937 if (vxattr->flags & VXATTR_RSTAT) {
12938 flags |= CEPH_STAT_RSTAT;
12939 }
12940 if (vxattr->flags & VXATTR_DIRSTAT) {
12941 flags |= CEPH_CAP_FILE_SHARED;
12942 }
12943 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12944 if (r != 0) {
12945 // Error from getattr!
12946 return r;
12947 }
12948
12949 // call pointer-to-member function
12950 char buf[256];
12951 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12952 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12953 } else {
12954 r = -CEPHFS_ENODATA;
12955 }
12956
12957 if (size != 0) {
12958 if (r > (int)size) {
12959 r = -CEPHFS_ERANGE;
12960 } else if (r > 0) {
12961 memcpy(value, buf, r);
12962 }
12963 }
12964 goto out;
12965 }
12966
12967 if (!strncmp(name, "ceph.", 5)) {
12968 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12969 goto out;
12970 }
12971
12972 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12973 r = -CEPHFS_EOPNOTSUPP;
12974 goto out;
12975 }
12976
12977 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12978 if (r == 0) {
12979 string n(name);
12980 r = -CEPHFS_ENODATA;
12981 if (in->xattrs.count(n)) {
12982 r = in->xattrs[n].length();
12983 if (r > 0 && size != 0) {
12984 if (size >= (unsigned)r)
12985 memcpy(value, in->xattrs[n].c_str(), r);
12986 else
12987 r = -CEPHFS_ERANGE;
12988 }
12989 }
12990 }
12991 out:
12992 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12993 return r;
12994 }
12995
12996 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12997 const UserPerm& perms)
12998 {
12999 if (cct->_conf->client_permissions) {
13000 int r = xattr_permission(in.get(), name, MAY_READ, perms);
13001 if (r < 0)
13002 return r;
13003 }
13004 return _getxattr(in.get(), name, value, size, perms);
13005 }
13006
13007 int Client::ll_getxattr(Inode *in, const char *name, void *value,
13008 size_t size, const UserPerm& perms)
13009 {
13010 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13011 if (!mref_reader.is_state_satisfied())
13012 return -CEPHFS_ENOTCONN;
13013
13014 vinodeno_t vino = _get_vino(in);
13015
13016 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
13017 tout(cct) << __func__ << std::endl;
13018 tout(cct) << vino.ino.val << std::endl;
13019 tout(cct) << name << std::endl;
13020
13021 std::scoped_lock lock(client_lock);
13022 if (!fuse_default_permissions) {
13023 int r = xattr_permission(in, name, MAY_READ, perms);
13024 if (r < 0)
13025 return r;
13026 }
13027
13028 return _getxattr(in, name, value, size, perms);
13029 }
13030
// Write the inode's non-"ceph." xattr names into `name` as consecutive
// NUL-terminated strings.  With size == 0 only the total length needed is
// computed.  Returns the total byte count, -CEPHFS_ERANGE if the buffer is
// too small, or a negative error from the cache refresh.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Only hit the MDS if we have never fetched this inode's xattrs.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue; // virtual "ceph.*" names are never listed
    }

    size_t this_len = xattr_name.length() + 1; // include the trailing NUL
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
13064
13065 int Client::ll_listxattr(Inode *in, char *names, size_t size,
13066 const UserPerm& perms)
13067 {
13068 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13069 if (!mref_reader.is_state_satisfied())
13070 return -CEPHFS_ENOTCONN;
13071
13072 vinodeno_t vino = _get_vino(in);
13073
13074 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
13075 tout(cct) << __func__ << std::endl;
13076 tout(cct) << vino.ino.val << std::endl;
13077 tout(cct) << size << std::endl;
13078
13079 std::scoped_lock lock(client_lock);
13080 return _listxattr(in, names, size, perms);
13081 }
13082
// Issue a CEPH_MDS_OP_SETXATTR request to the MDS.  A NULL `value` is
// translated into CEPH_XATTR_REMOVE; the userland XATTR_CREATE /
// XATTR_REPLACE flags map onto their CEPH_XATTR_* equivalents.
// Returns the MDS reply code.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name); // xattr name travels in string2
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The xattr value is carried as the request data payload.
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
13115
// Set an extended attribute on an inode.
//
// Rejects snapshot inodes (read-only).  POSIX ACL xattrs are validated
// and may be converted: an access ACL equivalent to a plain mode becomes
// a chmod and the xattr itself is dropped (value = NULL -> remove).
// Virtual "ceph.*" xattrs are dispatched to their setxattr callbacks or
// rejected when read-only.  For "ceph.quota*" writes we verify afterwards
// that the MDS created a snaprealm for the quota inode.
//
// Only user./security./trusted./ceph. names (plus system.* when POSIX
// ACLs are enabled) are accepted, matching the kernel client.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: empty value is allowed, NULL with a size is not.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  // ACL is mode-equivalent: drop the xattr, keep only the chmod.
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, nullptr);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      if (vxattr->setxattr_cb)
	return (this->*(vxattr->setxattr_cb))(in, value, size, perms);
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enabled() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
13198
13199 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
13200 size_t size, int flags, const UserPerm& perms)
13201 {
13202 if (cct->_conf->client_permissions) {
13203 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
13204 if (r < 0)
13205 return r;
13206 }
13207 return _setxattr(in.get(), name, value, size, flags, perms);
13208 }
13209
// Validate the data pool referenced by a layout xattr against `osdmap`.
//
// For name == "layout" the value is a key=value list; the "pool" entry is
// extracted with the keys_and_values spirit parser.  For "layout.pool" the
// value is the pool itself.  The pool may be given numerically (checked
// with have_pg_pool) or by name (resolved via lookup_pg_pool_name).
// Returns 0 when valid or no pool was specified, -CEPHFS_EINVAL on parse
// failure, -CEPHFS_ENOENT when the pool is unknown to this osdmap.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric pool id first; fall back to a pool-name lookup.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
13249
// Called before a setxattr that may name a data pool.  If the pool is not
// in our current osdmap, block until we have the latest osdmap so the MDS
// can validate the request against a sufficiently new epoch.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix -> "layout"/"layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
13272
13273 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
13274 size_t size, int flags, const UserPerm& perms)
13275 {
13276 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13277 if (!mref_reader.is_state_satisfied())
13278 return -CEPHFS_ENOTCONN;
13279
13280 _setxattr_maybe_wait_for_osdmap(name, value, size);
13281
13282 vinodeno_t vino = _get_vino(in);
13283
13284 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
13285 tout(cct) << __func__ << std::endl;
13286 tout(cct) << vino.ino.val << std::endl;
13287 tout(cct) << name << std::endl;
13288
13289 std::scoped_lock lock(client_lock);
13290 if (!fuse_default_permissions) {
13291 int r = xattr_permission(in, name, MAY_WRITE, perms);
13292 if (r < 0)
13293 return r;
13294 }
13295 return _setxattr(in, name, value, size, flags, perms);
13296 }
13297
// Remove an extended attribute via a CEPH_MDS_OP_RMXATTR request.
// Snapshot inodes are read-only; only the kernel-client-compatible
// namespaces are accepted, and read-only virtual xattrs are rejected.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name); // xattr name travels in filepath2
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
13329
13330 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
13331 {
13332 if (cct->_conf->client_permissions) {
13333 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
13334 if (r < 0)
13335 return r;
13336 }
13337 return _removexattr(in.get(), name, perms);
13338 }
13339
13340 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
13341 {
13342 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13343 if (!mref_reader.is_state_satisfied())
13344 return -CEPHFS_ENOTCONN;
13345
13346 vinodeno_t vino = _get_vino(in);
13347
13348 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
13349 tout(cct) << "ll_removexattr" << std::endl;
13350 tout(cct) << vino.ino.val << std::endl;
13351 tout(cct) << name << std::endl;
13352
13353 std::scoped_lock lock(client_lock);
13354 if (!fuse_default_permissions) {
13355 int r = xattr_permission(in, name, MAY_WRITE, perms);
13356 if (r < 0)
13357 return r;
13358 }
13359
13360 return _removexattr(in, name, perms);
13361 }
13362
13363 bool Client::_vxattrcb_fscrypt_auth_exists(Inode *in)
13364 {
13365 bool exists = !in->fscrypt_auth.empty();
13366
13367 ldout(cct, 10) << "fscrypt_auth exists " << exists << dendl;
13368 return exists;
13369 }
13370
13371 size_t Client::_vxattrcb_fscrypt_auth(Inode *in, char *val, size_t size)
13372 {
13373 size_t count = in->fscrypt_auth.size();
13374
13375 if (count <= size)
13376 memcpy(val, in->fscrypt_auth.data(), count);
13377 return count;
13378 }
13379
13380 int Client::_vxattrcb_fscrypt_auth_set(Inode *in, const void *val, size_t size,
13381 const UserPerm& perms)
13382 {
13383 struct ceph_statx stx = { 0 };
13384 std::vector<uint8_t> aux;
13385
13386 aux.resize(size);
13387 memcpy(aux.data(), val, size);
13388
13389 return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_AUTH, perms, nullptr, &aux);
13390 }
13391
13392 bool Client::_vxattrcb_fscrypt_file_exists(Inode *in)
13393 {
13394 return !in->fscrypt_file.empty();
13395 }
13396
13397 size_t Client::_vxattrcb_fscrypt_file(Inode *in, char *val, size_t size)
13398 {
13399 size_t count = in->fscrypt_file.size();
13400
13401 if (count <= size)
13402 memcpy(val, in->fscrypt_file.data(), count);
13403 return count;
13404 }
13405
13406 int Client::_vxattrcb_fscrypt_file_set(Inode *in, const void *val, size_t size,
13407 const UserPerm& perms)
13408 {
13409 struct ceph_statx stx = { 0 };
13410 std::vector<uint8_t> aux;
13411
13412 aux.resize(size);
13413 memcpy(aux.data(), val, size);
13414
13415 return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_FILE, perms, nullptr, &aux);
13416 }
13417
13418 bool Client::_vxattrcb_quota_exists(Inode *in)
13419 {
13420 return in->quota.is_enabled() &&
13421 (in->snapid != CEPH_NOSNAP ||
13422 (in->snaprealm && in->snaprealm->ino == in->ino));
13423 }
13424 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
13425 {
13426 return snprintf(val, size,
13427 "max_bytes=%lld max_files=%lld",
13428 (long long int)in->quota.max_bytes,
13429 (long long int)in->quota.max_files);
13430 }
13431 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
13432 {
13433 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
13434 }
13435 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
13436 {
13437 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
13438 }
13439
13440 bool Client::_vxattrcb_layout_exists(Inode *in)
13441 {
13442 return in->layout != file_layout_t();
13443 }
// vxattr getter: render the full layout as
// "stripe_unit=N stripe_count=N object_size=N pool=<name-or-id>
//  [ pool_namespace=NS]".  Pool id is translated to a name via the osdmap
// when possible.  Returns the snprintf-style would-have-written length.
// NOTE(review): if the first snprintf truncates (r >= size), the later
// `val + r` / `size - r` arithmetic would step past the buffer; the
// in-tree caller (_getxattr) passes a 256-byte buffer — confirm before
// reusing with smaller buffers.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
13464 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
13465 {
13466 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
13467 }
13468 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
13469 {
13470 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
13471 }
13472 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
13473 {
13474 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
13475 }
13476 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
13477 {
13478 size_t r;
13479 objecter->with_osdmap([&](const OSDMap& o) {
13480 if (o.have_pg_pool(in->layout.pool_id))
13481 r = snprintf(val, size, "%s", o.get_pool_name(
13482 in->layout.pool_id).c_str());
13483 else
13484 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
13485 });
13486 return r;
13487 }
13488 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
13489 {
13490 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
13491 }
13492 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
13493 {
13494 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
13495 }
13496 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
13497 {
13498 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
13499 }
13500 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
13501 {
13502 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
13503 }
13504 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
13505 {
13506 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
13507 }
13508 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
13509 {
13510 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
13511 }
13512 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
13513 {
13514 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
13515 }
13516 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
13517 {
13518 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
13519 }
13520 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
13521 {
13522 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
13523 }
13524 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
13525 {
13526 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
13527 (long)in->rstat.rctime.nsec());
13528 }
13529 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13530 {
13531 return in->dir_pin != -CEPHFS_ENODATA;
13532 }
13533 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13534 {
13535 return snprintf(val, size, "%ld", (long)in->dir_pin);
13536 }
13537
13538 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13539 {
13540 return !in->snap_btime.is_zero();
13541 }
13542
13543 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13544 {
13545 return snprintf(val, size, "%llu.%09lu",
13546 (long long unsigned)in->snap_btime.sec(),
13547 (long unsigned)in->snap_btime.nsec());
13548 }
13549
13550 size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13551 {
13552 int issued;
13553
13554 in->caps_issued(&issued);
13555 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13556 }
13557
13558 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13559 {
13560 // checking one of the xattrs would suffice
13561 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13562 }
13563
13564 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
13565 {
13566 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
13567 in->xattrs["ceph.mirror.info.cluster_id"].length(),
13568 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
13569 in->xattrs["ceph.mirror.info.fs_id"].length(),
13570 in->xattrs["ceph.mirror.info.fs_id"].c_str());
13571 }
13572
13573 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13574 {
13575 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13576 }
13577
13578 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13579 {
13580 auto name = messenger->get_myname();
13581 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
13582 }
13583
/* Build vxattr name strings "ceph.<type>.<name>" / "ceph.<type>.<name>.<name2>". */
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

/* Read-only vxattr entry backed by _vxattrcb_<type>_<name>; always "exists". */
#define XATTR_NAME_CEPH(_type, _name, _flags) \
{ \
name: CEPH_XATTR_NAME(_type, _name), \
getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
readonly: true, \
exists_cb: NULL, \
flags: _flags, \
}
/* Writable per-field layout vxattr; shown only when a layout is set. */
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
name: CEPH_XATTR_NAME2(_type, _name, _field), \
getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
readonly: false, \
exists_cb: &Client::_vxattrcb_layout_exists, \
flags: 0, \
}
/* Writable per-field quota vxattr; shown only when a quota is in effect. */
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
name: CEPH_XATTR_NAME(_type, _name), \
getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
readonly: false, \
exists_cb: &Client::_vxattrcb_quota_exists, \
flags: 0, \
}
13611
/* Virtual xattrs exposed on directory inodes; table is scanned linearly
 * and terminated by an entry with an empty name. */
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // FIXME
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // FIXME
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13677
/* Virtual xattrs exposed on regular-file inodes; terminated by an entry
 * with an empty name. */
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13707
/* Virtual xattrs available on every inode type, consulted after the
 * type-specific table in _match_vxattr(); empty-name terminated. */
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.fscrypt.auth",
    getxattr_cb: &Client::_vxattrcb_fscrypt_auth,
    setxattr_cb: &Client::_vxattrcb_fscrypt_auth_set,
    readonly: false,
    exists_cb: &Client::_vxattrcb_fscrypt_auth_exists,
    flags: 0,
  },
  {
    name: "ceph.fscrypt.file",
    getxattr_cb: &Client::_vxattrcb_fscrypt_file,
    setxattr_cb: &Client::_vxattrcb_fscrypt_file_set,
    readonly: false,
    exists_cb: &Client::_vxattrcb_fscrypt_file_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13741
13742 const Client::VXattr *Client::_get_vxattrs(Inode *in)
13743 {
13744 if (in->is_dir())
13745 return _dir_vxattrs;
13746 else if (in->is_file())
13747 return _file_vxattrs;
13748 return NULL;
13749 }
13750
13751 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13752 {
13753 if (strncmp(name, "ceph.", 5) == 0) {
13754 const VXattr *vxattr = _get_vxattrs(in);
13755 if (vxattr) {
13756 while (!vxattr->name.empty()) {
13757 if (vxattr->name == name)
13758 return vxattr;
13759 vxattr++;
13760 }
13761 }
13762
13763 // for common vxattrs
13764 vxattr = _common_vxattrs;
13765 while (!vxattr->name.empty()) {
13766 if (vxattr->name == name)
13767 return vxattr;
13768 vxattr++;
13769 }
13770 }
13771
13772 return NULL;
13773 }
13774
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  // Low-level readlink: copy the symlink target of `in` into buf/buflen.
  // Requires the client to be at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  // Touch every dentry referring to this inode to keep them warm in the
  // LRU cache.
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
13796
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  // Create a special/device file `name` in `dir` via a MKNOD MDS request.
  // On success *inp references the newly created inode. Caller must hold
  // client_lock (all ll_/fuse paths do).
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Ask the MDS to drop the shared dentry cap (unless we hold EXCL) so our
  // cached view of the parent directory stays coherent.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;
}
13847
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  // Low-level mknod: create `name` under `parent`; on success fill *attr
  // and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure `in` is empty, so *out is set to nullptr;
  // callers must check r before dereferencing *out.
  *out = in.get();
  return r;
}
13884
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // statx variant of ll_mknod: fills *stx (per want/flags mask) instead of
  // struct stat.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Translate the statx want/flags into a cap mask for fill_statx().
  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
13925
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  // Create (and optionally open) a regular file `name` in `dir` via a
  // CREATE MDS request. On success *inp references the inode; if fhp is
  // non-null an open file handle is also returned. `created` (may be null)
  // reports whether the MDS actually created the file.
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the optional data pool name to its id before issuing the
  // request; the wire format only carries a 32-bit pool id.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  bufferlist xattrs_bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;
}
14018
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp, const std::map<std::string, std::string> &metadata,
		   std::string alternate_name)
{
  // Create directory `name` in `dir`. When `dir` is the special snapdir
  // this becomes a MKSNAP (snapshot creation) request instead, and
  // `metadata` is sent as the snapshot metadata payload.
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Writable only in the live tree or in the snapdir (for mksnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
  MetaRequest *req = new MetaRequest(is_snap_op ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  // Snapshots do not carry their own ownership.
  if (!is_snap_op)
    req->set_inode_owner_uid_gid(perm.uid(), perm.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->set_alternate_name(std::move(alternate_name));

  mode |= S_IFDIR;
  bufferlist bl;
  // May adjust `mode` and produce an inherited default-ACL xattr blob.
  int res = _posix_acl_create(dir, &mode, bl, perm);
  if (res < 0) {
    put_request(req);
    return res;
  }
  req->head.args.mkdir.mode = mode;
  if (is_snap_op) {
    SnapPayload payload;
    // clear the bufferlist that may have been populated by the call
    // to _posix_acl_create(). MDS mksnap does not make use of it.
    // So, reuse it to pass metadata payload.
    bl.clear();
    payload.metadata = metadata;
    encode(payload, bl);
  }
  if (bl.length() > 0) {
    req->set_data(bl);
  }

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;
}
14086
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  // Low-level mkdir: create directory `name` under `parent`; on success
  // fill *attr and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14122
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  // statx variant of ll_mkdir: fills *stx (per want/flags mask). Unlike
  // ll_mkdir, this explicitly zeroes stx_ino/stx_mask on failure.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14162
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  // Create symlink `name` -> `target` in `dir` via a SYMLINK MDS request.
  // On success *inp references the new symlink inode.
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  req->set_inode_owner_uid_gid(perms.uid(), perms.gid());

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  // The link target travels in the request's string2 field.
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);

  int res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;
}
14204
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  // Low-level symlink: create `name` -> `value` under `parent`; on success
  // fill *attr and return an ll-referenced inode in *out.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14241
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  // statx variant of ll_symlink: fills *stx (per want/flags mask).
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());  // take an ll reference on behalf of the caller
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  // NOTE(review): on failure *out is set to nullptr; check r first.
  *out = in.get();
  return r;
}
14279
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  // Remove the non-directory entry `name` from `dir` via an UNLINK MDS
  // request.
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de = get_or_create(dir, name);
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the target so we can attach it to the request and break any
  // delegations before the MDS unlinks it.
  int res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0) {
    put_request(req);
    return res;
  }

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;
}
14323
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  // Low-level unlink of `name` from directory inode `in`.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
14346
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  // Remove directory `name` from `dir`. When `dir` is the snapdir this
  // becomes an RMSNAP (snapshot removal) request instead.
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Writable only in the live tree or in the snapdir (for rmsnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de = get_or_create(dir, name);
  // RMDIR attaches the dentry to the request (which then owns a ref);
  // RMSNAP instead takes a manual ref that is dropped after the explicit
  // unlink below.
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  // NOTE(review): on the RMSNAP path, if this lookup fails the manual
  // de->get() above appears to have no matching de->put() — possible
  // dentry ref leak; confirm against Dentry ref accounting.
  int res = _lookup(dir, name, 0, &in, perms);
  if (res < 0) {
    put_request(req);
    return res;
  }

  if (op == CEPH_MDS_OP_RMSNAP) {
    // rmsnap replies carry no trace dentry, so invalidate ours by hand.
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;
}
14394
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  // Low-level rmdir of `name` from directory inode `in`.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  // Enforce permissions here when the fuse layer is not doing it for us.
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
14418
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  // Rename fromdir/fromname -> todir/toname. Renaming within the snapdir
  // becomes RENAMESNAP; any other snapshot rename is rejected, as are
  // renames that would cross a quota-root boundary.
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // Source and destination must live in the same snapshot context.
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }

  // don't allow cross-quota renames
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enabled() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enabled() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde = get_or_create(fromdir, fromname);
  Dentry *de = get_or_create(todir, toname);

  int res;
  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    // Mark the destination dentry busy so concurrent lookups wait for the
    // rename to settle (cleared below after make_request).
    de->is_renaming = true;
    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm, nullptr, true);
    if (res < 0)
      goto fail;

    // Break delegations on the source inode before the MDS moves it.
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; ENOENT is fine, anything else
    // aborts the rename.
    res = _lookup(todir, toname, 0, &otherin, perm, nullptr, true);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // if rename fails it will miss waking up the waiters
  if (op == CEPH_MDS_OP_RENAME && de->is_renaming) {
    de->is_renaming = false;
    signal_cond_list(waiting_for_rename);
  }

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

fail:
  // NOTE(review): failure via these gotos leaves de->is_renaming set when
  // it was flipped on before the failing lookup — waiters are woken only
  // on the success path above; confirm waiting_for_rename handling.
  put_request(req);
  return res;
}
14530
14531 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14532 const char *newname, const UserPerm& perm)
14533 {
14534 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14535 if (!mref_reader.is_state_satisfied())
14536 return -CEPHFS_ENOTCONN;
14537
14538 vinodeno_t vparent = _get_vino(parent);
14539 vinodeno_t vnewparent = _get_vino(newparent);
14540
14541 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14542 << vnewparent << " " << newname << dendl;
14543 tout(cct) << "ll_rename" << std::endl;
14544 tout(cct) << vparent.ino.val << std::endl;
14545 tout(cct) << name << std::endl;
14546 tout(cct) << vnewparent.ino.val << std::endl;
14547 tout(cct) << newname << std::endl;
14548
14549 std::scoped_lock lock(client_lock);
14550
14551 if (!fuse_default_permissions) {
14552 int r = may_delete(parent, name, perm);
14553 if (r < 0)
14554 return r;
14555 r = may_delete(newparent, newname, perm);
14556 if (r < 0 && r != -CEPHFS_ENOENT)
14557 return r;
14558 }
14559
14560 return _rename(parent, name, newparent, newname, perm, "");
14561 }
14562
// Create a hard link dir/newname -> in by sending a CEPH_MDS_OP_LINK
// request to the MDS.  On success, *inp (if non-null) receives the inode
// returned by make_request.  Returns 0 or a negative CEPHFS_* errno.
// Caller must hold client_lock.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // hard links cannot be created in (or to) snapshots: read-only
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // honor the file-count quota of the target directory
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // a new link invalidates any delegations handed out on this inode
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);
  req->set_filepath2(existing);

  // drop the dir's FILE_SHARED cap (unless we hold FILE_EXCL) so the
  // directory contents get refreshed after the MDS applies the link
  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de = get_or_create(dir, newname);
  req->set_dentry(de);

  int res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;
}
14601
14602 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14603 const UserPerm& perm)
14604 {
14605 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14606 if (!mref_reader.is_state_satisfied())
14607 return -CEPHFS_ENOTCONN;
14608
14609 vinodeno_t vino = _get_vino(in);
14610 vinodeno_t vnewparent = _get_vino(newparent);
14611
14612 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
14613 newname << dendl;
14614 tout(cct) << "ll_link" << std::endl;
14615 tout(cct) << vino.ino.val << std::endl;
14616 tout(cct) << vnewparent << std::endl;
14617 tout(cct) << newname << std::endl;
14618
14619 InodeRef target;
14620
14621 std::scoped_lock lock(client_lock);
14622
14623 if (!fuse_default_permissions) {
14624 if (S_ISDIR(in->mode))
14625 return -CEPHFS_EPERM;
14626
14627 int r = may_hardlink(in, perm);
14628 if (r < 0)
14629 return r;
14630
14631 r = may_create(newparent, perm);
14632 if (r < 0)
14633 return r;
14634 }
14635
14636 return _link(in, newparent, newname, perm, "", &target);
14637 }
14638
14639 int Client::ll_num_osds(void)
14640 {
14641 std::scoped_lock lock(client_lock);
14642 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14643 }
14644
14645 int Client::ll_osdaddr(int osd, uint32_t *addr)
14646 {
14647 std::scoped_lock lock(client_lock);
14648
14649 entity_addr_t g;
14650 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14651 if (!o.exists(osd))
14652 return false;
14653 g = o.get_addrs(osd).front();
14654 return true;
14655 });
14656 if (!exists)
14657 return -1;
14658 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14659 *addr = ntohl(nb_addr);
14660 return 0;
14661 }
14662
14663 uint32_t Client::ll_stripe_unit(Inode *in)
14664 {
14665 std::scoped_lock lock(client_lock);
14666 return in->layout.stripe_unit;
14667 }
14668
14669 uint64_t Client::ll_snap_seq(Inode *in)
14670 {
14671 std::scoped_lock lock(client_lock);
14672 return in->snaprealm->seq;
14673 }
14674
14675 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14676 {
14677 std::scoped_lock lock(client_lock);
14678 *layout = in->layout;
14679 return 0;
14680 }
14681
14682 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14683 {
14684 return ll_file_layout(fh->inode.get(), layout);
14685 }
14686
14687 /* Currently we cannot take advantage of redundancy in reads, since we
14688 would have to go through all possible placement groups (a
14689 potentially quite large number determined by a hash), and use CRUSH
14690 to calculate the appropriate set of OSDs for each placement group,
14691 then index into that. An array with one entry per OSD is much more
14692 tractable and works for demonstration purposes. */
14693
// Map a file block number to the primary OSD currently serving it:
// compute the RADOS object the block lands in from the striping layout,
// then ask the OSDMap for that object's acting primary.
// NOTE(review): divides by layout->stripe_unit below; a layout with
// stripe_unit == 0 would be undefined behavior — confirm layouts are
// validated before reaching this call (stripe_count *is* guarded).
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if(stripe_count) {
    stripeno = blockno / stripe_count; // which horizontal stripe (Y)
    stripepos = blockno % stripe_count; // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object; // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
    ceph_object_layout olayout =
      o.file_to_object_layout(oid, *layout);
    pg_t pg = (pg_t)olayout.ol_pgid;
    vector<int> osds;
    int primary;
    o.pg_to_acting_osds(pg, &osds, &primary);
    return primary;
  });
}
14724
14725 /* Return the offset of the block, internal to the object */
14726
// Map a file block number to its byte offset within the RADOS object it
// belongs to, based purely on the inode's striping layout.
// NOTE(review): divides by the layout's stripe_unit; stripe_unit == 0
// would be undefined behavior — confirm layouts are validated upstream.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::scoped_lock lock(client_lock);
  file_layout_t *layout=&(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
14737
14738 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14739 const UserPerm& perms)
14740 {
14741 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14742 if (!mref_reader.is_state_satisfied())
14743 return -CEPHFS_ENOTCONN;
14744
14745 vinodeno_t vino = _get_vino(in);
14746
14747 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14748 tout(cct) << "ll_opendir" << std::endl;
14749 tout(cct) << vino.ino.val << std::endl;
14750
14751 std::scoped_lock lock(client_lock);
14752
14753 if (!fuse_default_permissions) {
14754 int r = may_open(in, flags, perms);
14755 if (r < 0)
14756 return r;
14757 }
14758
14759 int r = _opendir(in, dirpp, perms);
14760 tout(cct) << (uintptr_t)*dirpp << std::endl;
14761
14762 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14763 << dendl;
14764 return r;
14765 }
14766
14767 int Client::ll_releasedir(dir_result_t *dirp)
14768 {
14769 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14770 if (!mref_reader.is_state_satisfied())
14771 return -CEPHFS_ENOTCONN;
14772
14773 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14774 tout(cct) << "ll_releasedir" << std::endl;
14775 tout(cct) << (uintptr_t)dirp << std::endl;
14776
14777 std::scoped_lock lock(client_lock);
14778
14779 _closedir(dirp);
14780 return 0;
14781 }
14782
14783 int Client::ll_fsyncdir(dir_result_t *dirp)
14784 {
14785 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14786 if (!mref_reader.is_state_satisfied())
14787 return -CEPHFS_ENOTCONN;
14788
14789 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14790 tout(cct) << "ll_fsyncdir" << std::endl;
14791 tout(cct) << (uintptr_t)dirp << std::endl;
14792
14793 std::scoped_lock lock(client_lock);
14794 return _fsync(dirp->inode.get(), false);
14795 }
14796
14797 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14798 {
14799 ceph_assert(!(flags & O_CREAT));
14800
14801 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14802 if (!mref_reader.is_state_satisfied())
14803 return -CEPHFS_ENOTCONN;
14804
14805 vinodeno_t vino = _get_vino(in);
14806
14807 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14808 tout(cct) << "ll_open" << std::endl;
14809 tout(cct) << vino.ino.val << std::endl;
14810 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14811
14812 std::scoped_lock lock(client_lock);
14813
14814 int r;
14815 if (!fuse_default_permissions) {
14816 r = may_open(in, flags, perms);
14817 if (r < 0)
14818 goto out;
14819 }
14820
14821 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14822
14823 out:
14824 Fh *fhptr = fhp ? *fhp : NULL;
14825 if (fhptr) {
14826 ll_unclosed_fh_set.insert(fhptr);
14827 }
14828 tout(cct) << (uintptr_t)fhptr << std::endl;
14829 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14830 " = " << r << " (" << fhptr << ")" << dendl;
14831 return r;
14832 }
14833
// Shared implementation of ll_create/ll_createx: look the name up, create
// it if absent and O_CREAT is set, then open it.  On success *in holds the
// resulting inode and *fhp the open handle (opened here if _create did not
// supply one).  Returns 0 or a negative CEPHFS_* errno.  Caller must hold
// client_lock.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // existing name + O_CREAT|O_EXCL is an error by definition
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    // name is absent and the caller asked us to create it
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission and open it ourselves
    // if _create/_lookup did not leave us a handle
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // track every handle we hand out until the ll consumer closes it
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

#ifdef _WIN32
  uint64_t ino = 0;
#else
  ino_t ino = 0;
#endif
  if (r >= 0) {
    Inode *inode = in->get();
    // report the faked ino if that mode is enabled (e.g. 32-bit FUSE)
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14919
14920 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14921 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14922 const UserPerm& perms)
14923 {
14924 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14925 if (!mref_reader.is_state_satisfied())
14926 return -CEPHFS_ENOTCONN;
14927
14928 std::scoped_lock lock(client_lock);
14929 InodeRef in;
14930
14931 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14932 fhp, perms);
14933 if (r >= 0) {
14934 ceph_assert(in);
14935
14936 // passing an Inode in outp requires an additional ref
14937 if (outp) {
14938 _ll_get(in.get());
14939 *outp = in.get();
14940 }
14941 fill_stat(in, attr);
14942 } else {
14943 attr->st_ino = 0;
14944 }
14945
14946 return r;
14947 }
14948
14949 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14950 int oflags, Inode **outp, Fh **fhp,
14951 struct ceph_statx *stx, unsigned want, unsigned lflags,
14952 const UserPerm& perms)
14953 {
14954 unsigned caps = statx_to_mask(lflags, want);
14955 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14956 if (!mref_reader.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN;
14958
14959 std::scoped_lock lock(client_lock);
14960 InodeRef in;
14961
14962 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14963 if (r >= 0) {
14964 ceph_assert(in);
14965
14966 // passing an Inode in outp requires an additional ref
14967 if (outp) {
14968 _ll_get(in.get());
14969 *outp = in.get();
14970 }
14971 fill_statx(in, caps, stx);
14972 } else {
14973 stx->stx_ino = 0;
14974 stx->stx_mask = 0;
14975 }
14976
14977 return r;
14978 }
14979
14980 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14981 {
14982 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14983 if (!mref_reader.is_state_satisfied())
14984 return -CEPHFS_ENOTCONN;
14985
14986 tout(cct) << "ll_lseek" << std::endl;
14987 tout(cct) << offset << std::endl;
14988 tout(cct) << whence << std::endl;
14989
14990 std::scoped_lock lock(client_lock);
14991 return _lseek(fh, offset, whence);
14992 }
14993
14994 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14995 {
14996 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14997 if (!mref_reader.is_state_satisfied())
14998 return -CEPHFS_ENOTCONN;
14999
15000 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
15001 tout(cct) << "ll_read" << std::endl;
15002 tout(cct) << (uintptr_t)fh << std::endl;
15003 tout(cct) << off << std::endl;
15004 tout(cct) << len << std::endl;
15005
15006 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15007 len = std::min(len, (loff_t)INT_MAX);
15008 std::scoped_lock lock(client_lock);
15009
15010 int r = _read(fh, off, len, bl);
15011 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
15012 << dendl;
15013 return r;
15014 }
15015
15016 int Client::ll_read_block(Inode *in, uint64_t blockid,
15017 char *buf,
15018 uint64_t offset,
15019 uint64_t length,
15020 file_layout_t* layout)
15021 {
15022 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15023 if (!mref_reader.is_state_satisfied())
15024 return -CEPHFS_ENOTCONN;
15025
15026 vinodeno_t vino = _get_vino(in);
15027 object_t oid = file_object_t(vino.ino, blockid);
15028 C_SaferCond onfinish;
15029 bufferlist bl;
15030
15031 objecter->read(oid,
15032 object_locator_t(layout->pool_id),
15033 offset,
15034 length,
15035 vino.snapid,
15036 &bl,
15037 CEPH_OSD_FLAG_READ,
15038 &onfinish);
15039
15040 int r = onfinish.wait();
15041 if (r >= 0) {
15042 bl.begin().copy(bl.length(), buf);
15043 r = bl.length();
15044 }
15045
15046 return r;
15047 }
15048
15049 /* It appears that the OSD doesn't return success unless the entire
15050 buffer was written, return the write length on success. */
15051
15052 int Client::ll_write_block(Inode *in, uint64_t blockid,
15053 char* buf, uint64_t offset,
15054 uint64_t length, file_layout_t* layout,
15055 uint64_t snapseq, uint32_t sync)
15056 {
15057 vinodeno_t vino = ll_get_vino(in);
15058 int r = 0;
15059 std::unique_ptr<C_SaferCond> onsafe = nullptr;
15060
15061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15062 if (!mref_reader.is_state_satisfied())
15063 return -CEPHFS_ENOTCONN;
15064
15065 if (length == 0) {
15066 return -CEPHFS_EINVAL;
15067 }
15068 if (true || sync) {
15069 /* if write is stable, the epilogue is waiting on
15070 * flock */
15071 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
15072 }
15073 object_t oid = file_object_t(vino.ino, blockid);
15074 SnapContext fakesnap;
15075 ceph::bufferlist bl;
15076 if (length > 0) {
15077 bl.push_back(buffer::copy(buf, length));
15078 }
15079
15080 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
15081 << dendl;
15082
15083 fakesnap.seq = snapseq;
15084
15085 /* lock just in time */
15086 objecter->write(oid,
15087 object_locator_t(layout->pool_id),
15088 offset,
15089 length,
15090 fakesnap,
15091 bl,
15092 ceph::real_clock::now(),
15093 0,
15094 onsafe.get());
15095
15096 if (nullptr != onsafe) {
15097 r = onsafe->wait();
15098 }
15099
15100 if (r < 0) {
15101 return r;
15102 } else {
15103 return length;
15104 }
15105 }
15106
// Currently a no-op stub: the barrier/commit machinery below is compiled
// out, so the call always reports success.  The commented code is kept as
// a sketch of the intended implementation.
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
15132
15133 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
15134 {
15135 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
15136 "~" << len << dendl;
15137 tout(cct) << "ll_write" << std::endl;
15138 tout(cct) << (uintptr_t)fh << std::endl;
15139 tout(cct) << off << std::endl;
15140 tout(cct) << len << std::endl;
15141
15142 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15143 if (!mref_reader.is_state_satisfied())
15144 return -CEPHFS_ENOTCONN;
15145
15146 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15147 len = std::min(len, (loff_t)INT_MAX);
15148 std::scoped_lock lock(client_lock);
15149
15150 int r = _write(fh, off, len, data, NULL, 0);
15151 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
15152 << dendl;
15153 return r;
15154 }
15155
15156 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
15157 {
15158 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15159 if (!mref_reader.is_state_satisfied())
15160 return -CEPHFS_ENOTCONN;
15161
15162 std::scoped_lock cl(client_lock);
15163 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
15164 }
15165
15166 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
15167 {
15168 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15169 if (!mref_reader.is_state_satisfied())
15170 return -CEPHFS_ENOTCONN;
15171
15172 std::scoped_lock cl(client_lock);
15173 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
15174 }
15175
15176 int Client::ll_flush(Fh *fh)
15177 {
15178 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15179 if (!mref_reader.is_state_satisfied())
15180 return -CEPHFS_ENOTCONN;
15181
15182 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
15183 tout(cct) << "ll_flush" << std::endl;
15184 tout(cct) << (uintptr_t)fh << std::endl;
15185
15186 std::scoped_lock lock(client_lock);
15187 return _flush(fh);
15188 }
15189
15190 int Client::ll_fsync(Fh *fh, bool syncdataonly)
15191 {
15192 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15193 if (!mref_reader.is_state_satisfied())
15194 return -CEPHFS_ENOTCONN;
15195
15196 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
15197 tout(cct) << "ll_fsync" << std::endl;
15198 tout(cct) << (uintptr_t)fh << std::endl;
15199
15200 std::scoped_lock lock(client_lock);
15201 int r = _fsync(fh, syncdataonly);
15202 if (r) {
15203 // If we're returning an error, clear it from the FH
15204 fh->take_async_err();
15205 }
15206 return r;
15207 }
15208
15209 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
15210 {
15211 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15212 if (!mref_reader.is_state_satisfied())
15213 return -CEPHFS_ENOTCONN;
15214
15215 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
15216 tout(cct) << "ll_sync_inode" << std::endl;
15217 tout(cct) << (uintptr_t)in << std::endl;
15218
15219 std::scoped_lock lock(client_lock);
15220 return _fsync(in, syncdataonly);
15221 }
15222
15223 int Client::clear_suid_sgid(Inode *in, const UserPerm& perms, bool defer)
15224 {
15225 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " defer "
15226 << defer << dendl;
15227
15228 if (!in->is_file()) {
15229 return 0;
15230 }
15231
15232 if (likely(!(in->mode & (S_ISUID|S_ISGID)))) {
15233 return 0;
15234 }
15235
15236 if (perms.uid() == 0 || perms.uid() == in->uid) {
15237 return 0;
15238 }
15239
15240 int mask = 0;
15241
15242 // always drop the suid
15243 if (unlikely(in->mode & S_ISUID)) {
15244 mask = CEPH_SETATTR_KILL_SUID;
15245 }
15246
15247 // remove the sgid if S_IXUGO is set or the inode is
15248 // is not in the caller's group list.
15249 if ((in->mode & S_ISGID) &&
15250 ((in->mode & S_IXUGO) || !perms.gid_in_groups(in->gid))) {
15251 mask |= CEPH_SETATTR_KILL_SGID;
15252 }
15253
15254 ldout(cct, 20) << __func__ << " mask " << mask << dendl;
15255 if (defer) {
15256 return mask;
15257 }
15258
15259 struct ceph_statx stx = { 0 };
15260 return __setattrx(in, &stx, mask, perms);
15261 }
15262
// Preallocate space or punch a hole in the file backing fh (fallocate(2)
// semantics; only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are
// supported).  Returns 0 or a negative CEPHFS_* errno.  Caller must hold
// client_lock; it is dropped while waiting on OSD operations.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  // only KEEP_SIZE and PUNCH_HOLE are understood ...
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  // ... and PUNCH_HOLE is only accepted combined with KEEP_SIZE
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // refuse to add data to a full pool; punching a hole frees space so it
  // is still allowed
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  // the handle must have been opened for writing
  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // growing the file must respect byte quotas
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  // take a FILE_WR cap ref; released at the bottom of this function
  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // a write by a non-owner clears setuid/setgid
  r = clear_suid_sgid(in, fh->actor_perms);
  if (r < 0) {
    put_cap_ref(in, CEPH_CAP_FILE_WR);
    return r;
  }

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // data is inline and we hold the buffer cap: rewrite the inline
      // blob locally with the punched range zero-filled
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);   // bytes before the hole
        int size = length;
        if (offset + size > len)
          size = len - offset;            // clamp hole to the inline length
        if (size > 0)
          bl.append_zero(size);           // the hole itself
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);  // bytes after the hole
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // data lives in RADOS (after uninlining, if needed): zero the range
      // through the Filer and wait for the OSDs to acknowledge
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop client_lock while the OSD zero completes
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocation past EOF just extends the file size locally
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // wait for the uninline started above (lock dropped meanwhile)
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
15390
15391 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
15392 {
15393 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15394 if (!mref_reader.is_state_satisfied())
15395 return -CEPHFS_ENOTCONN;
15396
15397 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
15398 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
15399 tout(cct) << (uintptr_t)fh << std::endl;
15400
15401 std::scoped_lock lock(client_lock);
15402 return _fallocate(fh, mode, offset, length);
15403 }
15404
15405 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
15406 {
15407 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15408 if (!mref_reader.is_state_satisfied())
15409 return -CEPHFS_ENOTCONN;
15410
15411 tout(cct) << __func__ << " " << fd << mode << " " << offset << " " << length << std::endl;
15412
15413 std::scoped_lock lock(client_lock);
15414 Fh *fh = get_filehandle(fd);
15415 if (!fh)
15416 return -CEPHFS_EBADF;
15417 #if defined(__linux__) && defined(O_PATH)
15418 if (fh->flags & O_PATH)
15419 return -CEPHFS_EBADF;
15420 #endif
15421 return _fallocate(fh, mode, offset, length);
15422 }
15423
15424 int Client::ll_release(Fh *fh)
15425 {
15426 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15427 if (!mref_reader.is_state_satisfied())
15428 return -CEPHFS_ENOTCONN;
15429
15430 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
15431 dendl;
15432 tout(cct) << __func__ << " (fh)" << std::endl;
15433 tout(cct) << (uintptr_t)fh << std::endl;
15434
15435 std::scoped_lock lock(client_lock);
15436
15437 if (ll_unclosed_fh_set.count(fh))
15438 ll_unclosed_fh_set.erase(fh);
15439 return _release_fh(fh);
15440 }
15441
15442 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
15443 {
15444 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15445 if (!mref_reader.is_state_satisfied())
15446 return -CEPHFS_ENOTCONN;
15447
15448 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
15449 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
15450
15451 std::scoped_lock lock(client_lock);
15452 return _getlk(fh, fl, owner);
15453 }
15454
15455 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
15456 {
15457 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15458 if (!mref_reader.is_state_satisfied())
15459 return -CEPHFS_ENOTCONN;
15460
15461 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
15462 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
15463
15464 std::scoped_lock lock(client_lock);
15465 return _setlk(fh, fl, owner, sleep);
15466 }
15467
15468 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
15469 {
15470 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15471 if (!mref_reader.is_state_satisfied())
15472 return -CEPHFS_ENOTCONN;
15473
15474 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
15475 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
15476
15477 std::scoped_lock lock(client_lock);
15478 return _flock(fh, cmd, owner);
15479 }
15480
15481 int Client::set_deleg_timeout(uint32_t timeout)
15482 {
15483 std::scoped_lock lock(client_lock);
15484
15485 /*
15486 * The whole point is to prevent blocklisting so we must time out the
15487 * delegation before the session autoclose timeout kicks in.
15488 */
15489 if (timeout >= mdsmap->get_session_autoclose())
15490 return -CEPHFS_EINVAL;
15491
15492 deleg_timeout = timeout;
15493 return 0;
15494 }
15495
15496 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
15497 {
15498 int ret = -CEPHFS_EINVAL;
15499
15500 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15501 if (!mref_reader.is_state_satisfied())
15502 return -CEPHFS_ENOTCONN;
15503
15504 std::scoped_lock lock(client_lock);
15505
15506 Inode *inode = fh->inode.get();
15507
15508 switch(cmd) {
15509 case CEPH_DELEGATION_NONE:
15510 inode->unset_deleg(fh);
15511 ret = 0;
15512 break;
15513 default:
15514 try {
15515 ret = inode->set_deleg(fh, cmd, cb, priv);
15516 } catch (std::bad_alloc&) {
15517 ret = -CEPHFS_ENOMEM;
15518 }
15519 break;
15520 }
15521 return ret;
15522 }
15523
/**
 * Completion context queued on the interrupt finisher by Client::ll_interrupt()
 * to abort an in-flight SETFILELOCK MDS request.
 *
 * The request is reference-counted: the constructor takes a ref so the
 * MetaRequest stays alive until finish() runs on the finisher thread,
 * where the ref is dropped via put_request().
 */
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // hold a ref until finish() releases it
  }
  void finish(int r) override {
    std::scoped_lock l(client->client_lock);
    // only filelock requests are interruptible through this path
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};
15539
15540 void Client::ll_interrupt(void *d)
15541 {
15542 MetaRequest *req = static_cast<MetaRequest*>(d);
15543 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15544 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
15545 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15546 }
15547
15548 // =========================================
15549 // layout
15550
15551 // expose file layouts
15552
15553 int Client::describe_layout(const char *relpath, file_layout_t *lp,
15554 const UserPerm& perms)
15555 {
15556 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15557 if (!mref_reader.is_state_satisfied())
15558 return -CEPHFS_ENOTCONN;
15559
15560 std::scoped_lock lock(client_lock);
15561
15562 filepath path(relpath);
15563 InodeRef in;
15564 int r = path_walk(path, &in, perms);
15565 if (r < 0)
15566 return r;
15567
15568 *lp = in->layout;
15569
15570 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
15571 return 0;
15572 }
15573
15574 int Client::fdescribe_layout(int fd, file_layout_t *lp)
15575 {
15576 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15577 if (!mref_reader.is_state_satisfied())
15578 return -CEPHFS_ENOTCONN;
15579
15580 std::scoped_lock lock(client_lock);
15581
15582 Fh *f = get_filehandle(fd);
15583 if (!f)
15584 return -CEPHFS_EBADF;
15585 Inode *in = f->inode.get();
15586
15587 *lp = in->layout;
15588
15589 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
15590 return 0;
15591 }
15592
15593 int64_t Client::get_default_pool_id()
15594 {
15595 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15596 if (!mref_reader.is_state_satisfied())
15597 return -CEPHFS_ENOTCONN;
15598
15599 std::scoped_lock lock(client_lock);
15600
15601 /* first data pool is the default */
15602 return mdsmap->get_first_data_pool();
15603 }
15604
15605 // expose osdmap
15606
15607 int64_t Client::get_pool_id(const char *pool_name)
15608 {
15609 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15610 if (!mref_reader.is_state_satisfied())
15611 return -CEPHFS_ENOTCONN;
15612
15613 std::scoped_lock lock(client_lock);
15614
15615 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15616 pool_name);
15617 }
15618
15619 string Client::get_pool_name(int64_t pool)
15620 {
15621 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15622 if (!mref_reader.is_state_satisfied())
15623 return string();
15624
15625 std::scoped_lock lock(client_lock);
15626
15627 return objecter->with_osdmap([pool](const OSDMap& o) {
15628 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15629 });
15630 }
15631
15632 int Client::get_pool_replication(int64_t pool)
15633 {
15634 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15635 if (!mref_reader.is_state_satisfied())
15636 return -CEPHFS_ENOTCONN;
15637
15638 std::scoped_lock lock(client_lock);
15639
15640 return objecter->with_osdmap([pool](const OSDMap& o) {
15641 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
15642 });
15643 }
15644
/**
 * Find the acting OSD set for the object backing a file offset.
 *
 * @param fd   open cephfs file descriptor
 * @param off  file offset of interest
 * @param len  out (optional): bytes remaining in the stripe unit at off
 * @param osds out: acting OSDs for the PG holding that object
 * @return 0 on success, -CEPHFS_EBADF for an unknown fd, -CEPHFS_EINVAL if
 *         no acting OSDs were found, -CEPHFS_ENOTCONN when not mounted
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // Map a 1-byte span at 'off' to its single backing object extent.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // Ask the osdmap which OSDs currently serve that object's PG.
  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -CEPHFS_EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
15691
15692 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15693 {
15694 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15695 if (!mref_reader.is_state_satisfied())
15696 return -CEPHFS_ENOTCONN;
15697
15698 std::scoped_lock lock(client_lock);
15699
15700 if (id < 0)
15701 return -CEPHFS_EINVAL;
15702 return objecter->with_osdmap([&](const OSDMap& o) {
15703 return o.crush->get_full_location_ordered(id, path);
15704 });
15705 }
15706
/**
 * Find the network addresses of the OSDs serving the object at a file offset.
 *
 * @param fd      open cephfs file descriptor
 * @param offset  file offset of interest
 * @param address out: one address per acting OSD (front address of each)
 * @return 0 on success, -CEPHFS_EBADF for an unknown fd, -CEPHFS_EINVAL if
 *         the acting set is empty, -CEPHFS_ENOTCONN when not mounted
 */
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -CEPHFS_EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
15741
15742 int Client::get_osd_addr(int osd, entity_addr_t& addr)
15743 {
15744 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15745 if (!mref_reader.is_state_satisfied())
15746 return -CEPHFS_ENOTCONN;
15747
15748 std::scoped_lock lock(client_lock);
15749
15750 return objecter->with_osdmap([&](const OSDMap& o) {
15751 if (!o.exists(osd))
15752 return -CEPHFS_ENOENT;
15753
15754 addr = o.get_addrs(osd).front();
15755 return 0;
15756 });
15757 }
15758
15759 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15760 loff_t length, loff_t offset)
15761 {
15762 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15763 if (!mref_reader.is_state_satisfied())
15764 return -CEPHFS_ENOTCONN;
15765
15766 std::scoped_lock lock(client_lock);
15767
15768 Fh *f = get_filehandle(fd);
15769 if (!f)
15770 return -CEPHFS_EBADF;
15771 Inode *in = f->inode.get();
15772
15773 // map to a list of extents
15774 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15775
15776 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
15777 return 0;
15778 }
15779
15780
15781 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
int Client::get_local_osd()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  // The lookup is cached per osdmap epoch: only re-scan the map for an OSD
  // on our own IP when the epoch has changed since the last call.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
15798
15799
15800
15801
15802
15803
15804 // ===============================
15805
// Messenger callback: a connection was established. Log-only; no state change.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}
15810
// Messenger callback: our side of the connection was reset. Returning false
// tells the messenger we did not handle/claim the reset.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15816
/**
 * Messenger callback: the remote peer reset the connection.
 *
 * For MDS peers, find the matching session by address and react according to
 * its state: a CLOSING session is treated as closed, an OPENING session is
 * retried (preserving its open-waiters), and an OPEN session is either closed
 * for reconnect or marked STALE depending on client_reconnect_stale.
 */
void Client::ms_handle_remote_reset(Connection *con)
{
  std::scoped_lock lock(client_lock);
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSessionRef s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = p.second;
	}
      }
      if (mds >= 0) {
	ceph_assert(s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s.get());
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    // carry the open-waiters over to the retried session so nobody
	    // blocks forever on the dead one
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s.get());
	    auto news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blocklisted */
	    if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s.get());
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
15875
// Messenger callback: the peer refused the connection. Returning false tells
// the messenger we did not handle/claim the event.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
15881
/**
 * Find the nearest ancestor inode (via the snaprealm chain) that has a quota
 * of the given type enabled.
 *
 * Walks up the inode's snaprealm parents; the first realm inode (other than
 * the inode itself) with the quota type enabled wins. Falls back to
 * root_ancestor if none is found, and returns NULL when quotas are disabled
 * by config.  perms is currently unused in this implementation.
 */
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms, quota_max_t type)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;

  if (!cct->_conf.get_val<bool>("client_quota"))
    return NULL;

  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      // realm inode must be in cache (head version) for us to inspect quota
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enabled(type)) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
15907
15908 /**
15909 * Traverse quota ancestors of the Inode, return true
15910 * if any of them passes the passed function
15911 */
15912 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15913 std::function<bool (const Inode &in)> test)
15914 {
15915 if (!cct->_conf.get_val<bool>("client_quota"))
15916 return false;
15917
15918 while (true) {
15919 ceph_assert(in != NULL);
15920 if (test(*in)) {
15921 return true;
15922 }
15923
15924 if (in == root_ancestor) {
15925 // We're done traversing, drop out
15926 return false;
15927 } else {
15928 // Continue up the tree
15929 in = get_quota_root(in, perms);
15930 }
15931 }
15932
15933 return false;
15934 }
15935
15936 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15937 {
15938 return check_quota_condition(in, perms,
15939 [](const Inode &in) {
15940 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15941 });
15942 }
15943
15944 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
15945 const UserPerm& perms)
15946 {
15947 return check_quota_condition(in, perms,
15948 [&new_bytes](const Inode &in) {
15949 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15950 > in.quota.max_bytes;
15951 });
15952 }
15953
15954 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
15955 {
15956 ceph_assert(in->size >= in->reported_size);
15957 const uint64_t size = in->size - in->reported_size;
15958 return check_quota_condition(in, perms,
15959 [&size](const Inode &in) {
15960 if (in.quota.max_bytes) {
15961 if (in.rstat.rbytes >= in.quota.max_bytes) {
15962 return true;
15963 }
15964
15965 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
15966 return (space >> 4) < size;
15967 } else {
15968 return false;
15969 }
15970 });
15971 }
15972
// OR-able flags cached per (pool id, namespace) by check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a permission probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; other callers wait on it
  POOL_READ = 4,      // probe showed the client can read from the pool
  POOL_WRITE = 8,     // probe showed the client can write to the pool
};
15979
/**
 * Verify that this client has the OSD capabilities needed to access the data
 * pool of a regular file, probing the pool with a read and a write op on the
 * file's first object and caching the result per (pool, namespace).
 *
 * Must be called with client_lock held; the lock is dropped while waiting for
 * the probe ops to complete. Concurrent callers for the same pool wait on
 * waiting_for_pool_perm instead of issuing duplicate probes.
 *
 * @param in   inode whose layout selects the pool/namespace to check
 * @param need CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if permitted (or checking disabled / not a regular file /
 *         snapshot inode), -CEPHFS_EPERM if the needed access is denied,
 *         -CEPHFS_EIO if the probe failed with an unexpected error
 */
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is probing this pool, wait for it.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the probe so concurrent callers block instead of duplicating it.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read access with a stat on the first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create (EEXIST still means we
    // were allowed to write).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop client_lock while the probe ops are in flight.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Cache the result and wake any waiters blocked on our probe.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
16088
16089 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
16090 {
16091 if (acl_type == POSIX_ACL) {
16092 if (in->xattrs.count(ACL_EA_ACCESS)) {
16093 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
16094
16095 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
16096 }
16097 }
16098 return -CEPHFS_EAGAIN;
16099 }
16100
/**
 * Keep the inode's POSIX access ACL consistent with a chmod: rewrite the
 * cached ACL_EA_ACCESS xattr so its group/mask entries reflect the new mode.
 *
 * @param in    inode being chmod'ed
 * @param mode  new mode bits
 * @param perms caller credentials for the getattr/setxattr
 * @return 0 on success or when ACLs are disabled / no access ACL exists;
 *         negative error from getattr, ACL rewrite, or setxattr otherwise
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Make sure the xattr cache is populated before consulting it.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // work on a private copy; posix_acl_access_chmod edits it in place
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
16126
/**
 * Compute the POSIX ACL xattrs a new inode should be created with, based on
 * the parent directory's default ACL, and adjust *mode accordingly.
 *
 * If the parent has a default ACL, it is inherited (possibly producing an
 * access ACL and, for directories, a copied default ACL) and the resulting
 * xattr map is encoded into xattrs_bl. If there is no default ACL, the
 * umask callback (if registered) is applied to *mode instead.
 *
 * @param dir       parent directory
 * @param mode      in/out: requested mode; updated by ACL inheritance/umask
 * @param xattrs_bl out: encoded xattr map to attach to the create request
 * @param perms     caller credentials for the getattr
 * @return number of xattrs encoded (>= 0) on success, negative error code
 *         from getattr or the ACL helpers on failure; 0 and no-op for
 *         symlinks or when ACLs are disabled
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Make sure the parent's xattr cache is populated before consulting it.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      // private copy; posix_acl_inherit_mode edits the ACL and *mode in place
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0: the inherited ACL is non-trivial
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;  // not representable by mode bits alone
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: fall back to the registered umask callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
16174
// Enable a global objecter op flag. Only CEPH_OSD_FLAG_LOCALIZE_READS
// (or 0, a no-op) is accepted.
void Client::set_filer_flags(int flags)
{
  std::scoped_lock l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
16182
// Disable a global objecter op flag previously set by set_filer_flags().
// Only CEPH_OSD_FLAG_LOCALIZE_READS is accepted.
void Client::clear_filer_flags(int flags)
{
  std::scoped_lock l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
16189
16190 // called before mount
/**
 * Set the client's session uuid metadata (must be called before mount).
 *
 * Note: this closes any existing MDS sessions via _close_sessions(),
 * presumably so the new uuid takes effect when sessions are re-opened —
 * callers should be aware of that side effect.
 */
void Client::set_uuid(const std::string& uuid)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);
  ceph_assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}
16202
16203 // called before mount. 0 means infinite
/**
 * Record the desired session timeout in the session metadata sent to the MDS.
 * Must be called before mount; a value of 0 means infinite.
 */
void Client::set_session_timeout(unsigned timeout)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);

  metadata["timeout"] = stringify(timeout);
}
16213
16214 // called before mount
/**
 * Reclaim the MDS session state of a dead client instance identified by uuid
 * (must be called before mount).
 *
 * Sends MClientReclaim to every in/up MDS, waiting for sessions to open as
 * needed, then verifies via the OSD map that the old instance was blocklisted
 * (i.e. really killed) before committing the uuid for reuse.
 *
 * @param uuid    session uuid of the instance to reclaim; must differ from ours
 * @param flags   reclaim flags; CEPH_RECLAIM_RESET skips the blocklist check
 * @param fs_name filesystem to subscribe to
 * @return 0 on success; -CEPHFS_EINVAL / -CEPHFS_EPERM / -CEPHFS_EOPNOTSUPP /
 *         -CEPHFS_ENOENT / -CEPHFS_ENOTRECOVERABLE (or the MDS-reported
 *         reclaim error) on failure
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // refusing to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a real mdsmap before we can iterate ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // walk every in-mds rank; only advance 'mds' once its reclaim succeeded
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // (re)send the reclaim request and wait for the reply to flip the state
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
16314
16315 void Client::finish_reclaim()
16316 {
16317 auto it = metadata.find("reclaiming_uuid");
16318 if (it == metadata.end()) {
16319 for (auto &p : mds_sessions)
16320 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
16321 return;
16322 }
16323
16324 for (auto &p : mds_sessions) {
16325 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
16326 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
16327 p.second->con->send_message2(std::move(m));
16328 }
16329
16330 metadata["uuid"] = it->second;
16331 metadata.erase(it);
16332 }
16333
/**
 * Handle an MClientReclaimReply from an MDS during start_reclaim().
 *
 * On success, records the session's reclaim state as OK and accumulates the
 * highest OSD epoch and the target client's addresses reported by any MDS;
 * on failure, records RECLAIM_FAIL and the error. Either way, wakes the
 * start_reclaim() waiter.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // keep the max epoch across all MDS replies for the blocklist check
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
16359
16360 /**
16361 * This is included in cap release messages, to cause
16362 * the MDS to wait until this OSD map epoch. It is necessary
16363 * in corner cases where we cancel RADOS ops, so that
16364 * nobody else tries to do IO to the same objects in
16365 * the same epoch as the cancelled ops.
16366 */
// Record the OSD epoch barrier to attach to future cap release messages
// (see the comment block above for why the MDS needs it).
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
16372
16373 const char** Client::get_tracked_conf_keys() const
16374 {
16375 static const char* keys[] = {
16376 "client_cache_size",
16377 "client_cache_mid",
16378 "client_acl_type",
16379 "client_deleg_timeout",
16380 "client_deleg_break_on_open",
16381 "client_oc_size",
16382 "client_oc_max_objects",
16383 "client_oc_max_dirty",
16384 "client_oc_target_dirty",
16385 "client_oc_max_dirty_age",
16386 "client_caps_release_delay",
16387 "client_mount_timeout",
16388 NULL
16389 };
16390 return keys;
16391 }
16392
/**
 * Config-observer interface: react to runtime changes of tracked config keys
 * (see get_tracked_conf_keys()), pushing new values into the LRU, ACL mode,
 * object cacher limits, and cached timeout members.
 */
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::scoped_lock lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // anything other than "posix_acl" disables ACL handling
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  if (changed.count("client_oc_size")) {
    objectcacher->set_max_size(cct->_conf->client_oc_size);
  }
  if (changed.count("client_oc_max_objects")) {
    objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
  }
  if (changed.count("client_oc_max_dirty")) {
    objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
  }
  if (changed.count("client_oc_target_dirty")) {
    objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
  }
  if (changed.count("client_oc_max_dirty_age")) {
    objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
  }
  if (changed.count("client_collect_and_send_global_metrics")) {
    _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
      "client_collect_and_send_global_metrics");
  }
  if (changed.count("client_caps_release_delay")) {
    caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
      "client_caps_release_delay");
  }
  if (changed.count("client_mount_timeout")) {
    mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
      "client_mount_timeout");
  }
}
16434
// boost::intrusive_ptr hook: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
16439
// boost::intrusive_ptr hook: drop a reference on an Inode via its owning
// Client, which handles cleanup when the count hits zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
16444
16445 mds_rank_t Client::_get_random_up_mds() const
16446 {
16447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
16448
16449 std::set<mds_rank_t> up;
16450 mdsmap->get_up_mds_set(up);
16451
16452 if (up.empty())
16453 return MDS_RANK_NONE;
16454 std::set<mds_rank_t>::const_iterator p = up.begin();
16455 for (int n = rand() % up.size(); n; n--)
16456 ++p;
16457 return *p;
16458 }
16459
16460
// Standalone variant of Client that owns its own Objecter (the base Client
// is handed one). Wires the monclient to the messenger and marks this as
// incarnation 0.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
16468
// Destroy the Objecter we created in the constructor (the base Client does
// not own it).
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
16474
/**
 * Initialize the standalone client: start the objecter, register dispatchers,
 * and bring up the monclient.
 *
 * On monclient failure, carefully unwinds the partial initialization (timer,
 * objecter, objectcacher, monclient) before returning the error.
 *
 * @return 0 on success, negative error from MonClient::init() on failure
 */
int StandaloneClient::init()
{
  // we must be the first (and only) writer of the initialize state
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
16511
// Shut down in reverse order of init(): base Client first, then the
// objecter we own, then the monclient.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}