// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


// unix-ey fs stuff
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/param.h>
#include <fcntl.h>
#include <sys/file.h>
#ifndef _WIN32
#include <sys/utsname.h>
#endif
#include <sys/uio.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#include "common/async/waiter.h"

#if defined(__FreeBSD__) || defined(_WIN32)
#define XATTR_CREATE 0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MClientMetrics.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"
#include "include/random.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Delegation.h"
#include "Dir.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_ll_client.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
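// Note: tout is used as a statement prefix by the high-level entry points,
// e.g. tout(cct) << "mount" << std::endl; so a trace line is only written
// when client_trace is configured.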

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Windows doesn't define these values. While the POSIX compatibility layer
// doesn't support them, the native Windows functions do provide similar
// flags. Special care should be taken if we're going to use those flags in
// ceph-dokan. The current values are no-ops, while propagating them to the
// rest of the code might cause the Windows functions to reject them as
// invalid.
#ifndef O_NOFOLLOW
#define O_NOFOLLOW 0x0
#endif

#ifndef O_SYNC
#define O_SYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

using namespace TOPNSPC::common;

namespace bs = boost::system;
namespace ca = ceph::async;

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}


// -------------

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
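
// These commands are registered on the admin socket in _finish_init() and
// can be invoked against a running client, e.g. (socket path illustrative):
//   ceph daemon /var/run/ceph/ceph-client.admin.asok mds_sessions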


// -------------

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
#ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, use 64-bit identifiers.
  _use_faked_inos = false;
#else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
#endif
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

/*
 * In faked mode, if you export multiple subdirectories, the exported
 * subdirectories will all appear to have the same inode number. So we
 * distinguish the mount points by reserving the fake ids in the range
 * 1024~2048 and combining them with the last 10 bits (0x3ff) of the
 * root inodes.
 */
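/*
 * Worked example (illustrative): with a fresh free_faked_inos starting at
 * 1024, a root inode whose ino has low 10 bits 0x0fc (252) is assigned the
 * faked root id 1024 + 252 = 1276, which stays inside the reserved
 * 1024~2048 window.
 */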
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback, // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
}


Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // In case the task crashed or was aborted and never got a chance to
  // run the umount and shutdown.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}

void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
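  // (at this point the only inodes left should be the root plus the chain
  // of root_parents created while mounting a subtree)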
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::scoped_lock l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::scoped_lock l(client_lock);
  root->ll_get();
  return root;
}


// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED ":"")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}

void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blocklist_events();

  objectcacher->start();
}

int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}

void Client::_finish_init()
{
  {
    std::scoped_lock l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions "
                                       "name=cap_dump,type=CephBool,req=false",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before the Client is
    // destructed, just in case _mount() failed and didn't get a
    // chance to stop the tick thread itself.
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop(); // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just set the state to CLIENT_NEW to block and fail any newly
   * incoming "readers", and then wait for all the in-flight "readers"
   * to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}


// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break; // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false); // drop dir, drop dentry
}


void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

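  // Accept a size update only if it comes from a newer truncation epoch,
  // or from the same epoch and grows the file; e.g. a reply carrying
  // truncate_seq 5 when we already hold truncate_seq 6 is stale and ignored.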
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;
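  // (projected versions are odd: masking off the low bit with "& ~1" means,
  // e.g., a cached projected version 7 compares as 6 and still accepts an
  // authoritative stat carrying version 7)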

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true); // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}


/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true); // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
    dn->mark_primary();
  dn->alternate_name = std::move(dlease->alternate_name);
}


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();
  if (!dst->dist.empty())
    in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end());
  else
    in->frag_repmap.erase(dst->frag);
}

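// Note: bumping dir_release_count / dir_ordered_count below also invalidates
// any dir_result_t readdir caches that captured the old counter values (see
// the checks in insert_readdir_results).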
void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete)
    diri->dir_release_count++;
  else
    diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true); // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
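      // (make_fpos packs the fragment -- or the name hash, in hash_order
      // mode -- into the high bits of the 64-bit readdir position and the
      // per-frag entry index into the low bits, so positions stay
      // monotonically increasing across fragments)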
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        ceph_assert(od);
        unlink(od, true, true); // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true); // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst); // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true); // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true); // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

// -------

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        req->send_to_auth = true;
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }
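    // (routing sketch: a request on dentry "foo" hashes the name, maps the
    // hash to a fragment via the cached dirfragtree, then prefers a random
    // replica from frag_repmap when the auth MDS isn't required and the
    // fragment's auth MDS from fragmap otherwise)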

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (const auto &rank : info.export_targets) {
    if (mds_sessions.count(rank) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << rank << dendl;
      _open_mds_session(rank);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f, cap_dump);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -CEPHFS_EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}


/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
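/*
 * Illustrative caller pattern (a sketch mirroring helpers like _getattr
 * elsewhere in this file):
 *
 *   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
 *   filepath path;
 *   in->make_nosnap_relative_path(path);
 *   req->set_filepath(path);
 *   req->set_inode(in);
 *   req->head.args.getattr.mask = mask;
 *   InodeRef target;
 *   int r = make_request(req, perms, &target);
 */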
1752 int Client::make_request(MetaRequest *request,
1753 const UserPerm& perms,
1754 InodeRef *ptarget, bool *pcreated,
1755 mds_rank_t use_mds,
1756 bufferlist *pdirbl)
1757 {
1758 int r = 0;
1759
1760 // assign a unique tid
1761 ceph_tid_t tid = ++last_tid;
1762 request->set_tid(tid);
1763
1764 // and timestamp
1765 request->op_stamp = ceph_clock_now();
1766
1767 // make note
1768 mds_requests[tid] = request->get();
1769 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1770 oldest_tid = tid;
1771
1772 request->set_caller_perms(perms);
1773
1774 if (cct->_conf->client_inject_fixed_oldest_tid) {
1775 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1776 request->set_oldest_client_tid(1);
1777 } else {
1778 request->set_oldest_client_tid(oldest_tid);
1779 }
1780
1781 // hack target mds?
1782 if (use_mds >= 0)
1783 request->resend_mds = use_mds;
1784
1785 MetaSession *session = NULL;
1786 while (1) {
1787 if (request->aborted())
1788 break;
1789
1790 if (blocklisted) {
1791 request->abort(-CEPHFS_EBLOCKLISTED);
1792 break;
1793 }
1794
1795 // set up wait cond
1796 ceph::condition_variable caller_cond;
1797 request->caller_cond = &caller_cond;
1798
1799 // choose mds
1800 Inode *hash_diri = NULL;
1801 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1802 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1803 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1804 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1805 if (hash_diri) {
1806 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1807 _fragmap_remove_stopped_mds(hash_diri, mds);
1808 } else {
1809 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1810 request->resend_mds = _get_random_up_mds();
1811 }
1812 } else {
1813 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1814 wait_on_list(waiting_for_mdsmap);
1815 }
1816 continue;
1817 }
1818
1819 // open a session?
1820 if (!have_open_session(mds)) {
1821 session = _get_or_open_mds_session(mds);
1822 if (session->state == MetaSession::STATE_REJECTED) {
1823 request->abort(-CEPHFS_EPERM);
1824 break;
1825 }
1826 // wait
1827 if (session->state == MetaSession::STATE_OPENING) {
1828 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1829 wait_on_context_list(session->waiting_for_open);
1830 continue;
1831 }
1832
1833 if (!have_open_session(mds))
1834 continue;
1835 } else {
1836 session = &mds_sessions.at(mds);
1837 }
1838
1839 // send request.
1840 send_request(request, session);
1841
1842 // wait for signal
1843 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1844 request->kick = false;
1845 std::unique_lock l{client_lock, std::adopt_lock};
1846 caller_cond.wait(l, [request] {
1847 return (request->reply || // reply
1848 request->resend_mds >= 0 || // forward
1849 request->kick);
1850 });
1851 l.release();
1852 request->caller_cond = nullptr;
1853
1854 // did we get a reply?
1855 if (request->reply)
1856 break;
1857 }
1858
1859 if (!request->reply) {
1860 ceph_assert(request->aborted());
1861 ceph_assert(!request->got_unsafe);
1862 r = request->get_abort_code();
1863 request->item.remove_myself();
1864 unregister_request(request);
1865 put_request(request);
1866 return r;
1867 }
1868
1869 // got it!
1870 auto reply = std::move(request->reply);
1871 r = reply->get_result();
1872 if (r >= 0)
1873 request->success = true;
1874
1875 // kick dispatcher (we've got it!)
1876 ceph_assert(request->dispatch_cond);
1877 request->dispatch_cond->notify_all();
1878 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1879 request->dispatch_cond = 0;
1880
1881 if (r >= 0 && ptarget)
1882 r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);
1883
1884 if (pdirbl)
1885 *pdirbl = reply->get_extra_bl();
1886
1887 // -- log times --
1888 utime_t lat = ceph_clock_now();
1889 lat -= request->sent_stamp;
1890 ldout(cct, 20) << "lat " << lat << dendl;
1891 logger->tinc(l_c_lat, lat);
1892 logger->tinc(l_c_reply, lat);
1893
1894 put_request(request);
1895 return r;
1896 }
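/*
 * A note on the wait above: client_lock is already held when we get
 * here, so the unique_lock adopts it for the condition wait and then
 * release()s it, leaving ownership with the caller. A minimal,
 * self-contained sketch of the same pattern (purely illustrative;
 * these names are hypothetical and not part of Client.cc):
 *
 *   #include <condition_variable>
 *   #include <mutex>
 *
 *   std::mutex mtx;                // stands in for client_lock
 *   std::condition_variable cv;
 *   bool done = false;
 *
 *   void wait_already_locked()     // caller holds mtx, as in make_request
 *   {
 *     std::unique_lock l{mtx, std::adopt_lock};  // adopt, don't re-lock
 *     cv.wait(l, [] { return done; });           // atomically unlock+wait
 *     l.release();                 // hand ownership back; mtx stays held
 *   }
 */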
1897
1898 void Client::unregister_request(MetaRequest *req)
1899 {
1900 mds_requests.erase(req->tid);
1901 if (req->tid == oldest_tid) {
1902 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1903 while (true) {
1904 if (p == mds_requests.end()) {
1905 oldest_tid = 0;
1906 break;
1907 }
1908 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1909 oldest_tid = p->first;
1910 break;
1911 }
1912 ++p;
1913 }
1914 }
1915 put_request(req);
1916 }
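/*
 * Illustrative sketch (hypothetical, not part of Client.cc) of the
 * oldest_tid bookkeeping above: scan the tid-ordered map from
 * upper_bound(oldest) and stop at the first entry whose op is not
 * excluded, since filelock requests don't participate in the
 * oldest-tid accounting.
 *
 *   #include <cstdint>
 *   #include <map>
 *
 *   // Returns 0 when no eligible request remains.
 *   uint64_t next_oldest(const std::map<uint64_t, int>& ops, // tid -> op
 *                        uint64_t oldest, int skip_op)
 *   {
 *     for (auto p = ops.upper_bound(oldest); p != ops.end(); ++p)
 *       if (p->second != skip_op)
 *         return p->first;
 *     return 0;
 *   }
 */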
1917
1918 void Client::put_request(MetaRequest *request)
1919 {
1920 if (request->_put()) {
1921 int op = -1;
1922 if (request->success)
1923 op = request->get_op();
1924 InodeRef other_in;
1925 request->take_other_inode(&other_in);
1926 delete request;
1927
1928 if (other_in &&
1929 (op == CEPH_MDS_OP_RMDIR ||
1930 op == CEPH_MDS_OP_RENAME ||
1931 op == CEPH_MDS_OP_RMSNAP)) {
1932 _try_to_trim_inode(other_in.get(), false);
1933 }
1934 }
1935 }
1936
1937 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1938 mds_rank_t mds, int drop,
1939 int unless, int force)
1940 {
1941 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
1942 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1943 << ", force:" << force << ")" << dendl;
1944 int released = 0;
1945 auto it = in->caps.find(mds);
1946 if (it != in->caps.end()) {
1947 Cap &cap = it->second;
1948 drop &= ~(in->dirty_caps | get_caps_used(in));
1949 if ((drop & cap.issued) &&
1950 !(unless & cap.issued)) {
1951 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
1952 cap.issued &= ~drop;
1953 cap.implemented &= ~drop;
1954 released = 1;
1955 } else {
1956 released = force;
1957 }
1958 if (released) {
1959 cap.wanted = in->caps_wanted();
1960 if (&cap == in->auth_cap &&
1961 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1962 in->requested_max_size = 0;
1963 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1964 }
1965 ceph_mds_request_release rel;
1966 rel.ino = in->ino;
1967 rel.cap_id = cap.cap_id;
1968 rel.seq = cap.seq;
1969 rel.issue_seq = cap.issue_seq;
1970 rel.mseq = cap.mseq;
1971 rel.caps = cap.implemented;
1972 rel.wanted = cap.wanted;
1973 rel.dname_len = 0;
1974 rel.dname_seq = 0;
1975 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1976 }
1977 }
1978 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
1979 << released << dendl;
1980 return released;
1981 }
1982
1983 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1984 mds_rank_t mds, int drop, int unless)
1985 {
1986 ldout(cct, 20) << __func__ << " enter(dn:"
1987 << dn << ")" << dendl;
1988 int released = 0;
1989 if (dn->dir)
1990 released = encode_inode_release(dn->dir->parent_inode, req,
1991 mds, drop, unless, 1);
1992 if (released && dn->lease_mds == mds) {
1993 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1994 auto& rel = req->cap_releases.back();
1995 rel.item.dname_len = dn->name.length();
1996 rel.item.dname_seq = dn->lease_seq;
1997 rel.dname = dn->name;
1998 dn->lease_mds = -1;
1999 }
2000 ldout(cct, 25) << __func__ << " exit(dn:"
2001 << dn << ")" << dendl;
2002 }
2003
2004
2005 /*
2006 * This requires the MClientRequest *request member to be set.
2007 * It will error out horribly without one.
2008 * Additionally, if you set any *drop member, you'd better have
2009 * set the corresponding dentry!
2010 */
2011 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2012 {
2013 ldout(cct, 20) << __func__ << " enter (req: "
2014 << req << ", mds: " << mds << ")" << dendl;
2015 if (req->inode_drop && req->inode())
2016 encode_inode_release(req->inode(), req,
2017 mds, req->inode_drop,
2018 req->inode_unless);
2019
2020 if (req->old_inode_drop && req->old_inode())
2021 encode_inode_release(req->old_inode(), req,
2022 mds, req->old_inode_drop,
2023 req->old_inode_unless);
2024 if (req->other_inode_drop && req->other_inode())
2025 encode_inode_release(req->other_inode(), req,
2026 mds, req->other_inode_drop,
2027 req->other_inode_unless);
2028
2029 if (req->dentry_drop && req->dentry())
2030 encode_dentry_release(req->dentry(), req,
2031 mds, req->dentry_drop,
2032 req->dentry_unless);
2033
2034 if (req->old_dentry_drop && req->old_dentry())
2035 encode_dentry_release(req->old_dentry(), req,
2036 mds, req->old_dentry_drop,
2037 req->old_dentry_unless);
2038 ldout(cct, 25) << __func__ << " exit (req: "
2039 << req << ", mds: " << mds << ")" << dendl;
2040 }
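/*
 * The release decision in encode_inode_release() above is plain
 * cap-bitmask arithmetic: never drop caps that are dirty or in use,
 * and only drop when none of the "unless" caps are issued. A hedged
 * sketch of just that arithmetic on bare ints (illustrative only):
 *
 *   // Returns the new issued mask after a conditional release.
 *   int release_caps(int issued, int drop, int unless,
 *                    int dirty, int used)
 *   {
 *     drop &= ~(dirty | used);            // keep anything dirty/in use
 *     if ((drop & issued) && !(unless & issued))
 *       issued &= ~drop;                  // actually drop the caps
 *     return issued;
 *   }
 */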
2041
2042 bool Client::have_open_session(mds_rank_t mds)
2043 {
2044 const auto &it = mds_sessions.find(mds);
2045 return it != mds_sessions.end() &&
2046 (it->second.state == MetaSession::STATE_OPEN ||
2047 it->second.state == MetaSession::STATE_STALE);
2048 }
2049
2050 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
2051 {
2052 const auto &it = mds_sessions.find(mds);
2053 if (it == mds_sessions.end() || it->second.con != con) {
2054 return NULL;
2055 } else {
2056 return &it->second;
2057 }
2058 }
2059
2060 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
2061 {
2062 auto it = mds_sessions.find(mds);
2063 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
2064 }
2065
2066 /**
2067 * Populate a map of strings with client-identifying metadata,
2068 * such as the hostname. Call this once at initialization.
2069 */
2070 void Client::populate_metadata(const std::string &mount_root)
2071 {
2072 // Hostname
2073 #ifdef _WIN32
2074 // TODO: move this to compat.h
2075 char hostname[64];
2076 DWORD hostname_sz = 64;
2077 GetComputerNameA(hostname, &hostname_sz);
2078 metadata["hostname"] = hostname;
2079 #else
2080 struct utsname u;
2081 int r = uname(&u);
2082 if (r >= 0) {
2083 metadata["hostname"] = u.nodename;
2084 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2085 } else {
2086 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2087 }
2088 #endif
2089
2090 metadata["pid"] = stringify(getpid());
2091
2092 // Ceph entity id (the '0' in "client.0")
2093 metadata["entity_id"] = cct->_conf->name.get_id();
2094
2095 // Our mount position
2096 if (!mount_root.empty()) {
2097 metadata["root"] = mount_root;
2098 }
2099
2100 // Ceph version
2101 metadata["ceph_version"] = pretty_version_to_str();
2102 metadata["ceph_sha1"] = git_version_to_str();
2103
2104 // Apply any metadata from the user's configured overrides
2105 std::vector<std::string> tokens;
2106 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2107 for (const auto &i : tokens) {
2108 auto eqpos = i.find("=");
2109 // Throw out anything that isn't of the form "<str>=<str>"
2110 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2111 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2112 continue;
2113 }
2114 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2115 }
2116 }
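/*
 * The override parsing above accepts a comma-separated "key=value"
 * list and skips malformed items. A minimal sketch of the same
 * validation using only the standard library (names hypothetical):
 *
 *   #include <map>
 *   #include <string>
 *
 *   void apply_override(const std::string& kv,    // e.g. "rack=r1"
 *                       std::map<std::string, std::string>& md)
 *   {
 *     auto eq = kv.find('=');
 *     if (eq == 0 || eq == std::string::npos)
 *       return;                                   // not "<str>=<str>"
 *     md[kv.substr(0, eq)] = kv.substr(eq + 1);
 *   }
 */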
2117
2118 /**
2119 * Add or override a single client metadata field.
2120 */
2121 void Client::update_metadata(std::string const &k, std::string const &v)
2122 {
2123 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2124 ceph_assert(iref_reader.is_state_satisfied());
2125
2126 std::scoped_lock l(client_lock);
2127
2128 auto it = metadata.find(k);
2129 if (it != metadata.end()) {
2130 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2131 << "' from '" << it->second << "' to '" << v << "'" << dendl;
2132 }
2133
2134 metadata[k] = v;
2135 }
2136
2137 MetaSession *Client::_open_mds_session(mds_rank_t mds)
2138 {
2139 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2140 auto addrs = mdsmap->get_addrs(mds);
2141 auto em = mds_sessions.emplace(std::piecewise_construct,
2142 std::forward_as_tuple(mds),
2143 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2144 ceph_assert(em.second); /* not already present */
2145 MetaSession *session = &em.first->second;
2146
2147 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
2148 m->metadata = metadata;
2149 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2150 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
2151 session->con->send_message2(std::move(m));
2152 return session;
2153 }
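/*
 * The emplace above uses std::piecewise_construct so the MetaSession
 * value is constructed in place from (mds, connection, addrs) without
 * building a temporary. A minimal sketch of the idiom with a stand-in
 * value type (hypothetical, not part of Client.cc):
 *
 *   #include <map>
 *   #include <tuple>
 *   #include <utility>
 *
 *   struct Session {
 *     int rank; double weight;
 *     Session(int r, double w) : rank(r), weight(w) {}
 *   };
 *
 *   std::map<int, Session> sessions;
 *
 *   bool open(int rank)
 *   {
 *     auto em = sessions.emplace(std::piecewise_construct,
 *                                std::forward_as_tuple(rank),
 *                                std::forward_as_tuple(rank, 1.0));
 *     return em.second;   // false if the key already existed
 *   }
 */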
2154
2155 void Client::_close_mds_session(MetaSession *s)
2156 {
2157 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2158 s->state = MetaSession::STATE_CLOSING;
2159 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2160 }
2161
2162 void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
2163 {
2164 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
2165 if (rejected && s->state != MetaSession::STATE_CLOSING)
2166 s->state = MetaSession::STATE_REJECTED;
2167 else
2168 s->state = MetaSession::STATE_CLOSED;
2169 s->con->mark_down();
2170 signal_context_list(s->waiting_for_open);
2171 mount_cond.notify_all();
2172 remove_session_caps(s, err);
2173 kick_requests_closed(s);
2174 mds_ranks_closing.erase(s->mds_num);
2175 if (s->state == MetaSession::STATE_CLOSED)
2176 mds_sessions.erase(s->mds_num);
2177 }
2178
2179 void Client::handle_client_session(const MConstRef<MClientSession>& m)
2180 {
2181 mds_rank_t from = mds_rank_t(m->get_source().num());
2182 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
2183
2184 std::scoped_lock cl(client_lock);
2185 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2186 if (!session) {
2187 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2188 return;
2189 }
2190
2191 switch (m->get_op()) {
2192 case CEPH_SESSION_OPEN:
2193 {
2194 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2195 missing_features -= m->supported_features;
2196 if (!missing_features.empty()) {
2197 lderr(cct) << "mds." << from << " lacks required features '"
2198 << missing_features << "', closing session " << dendl;
2199 _close_mds_session(session);
2200 _closed_mds_session(session, -CEPHFS_EPERM, true);
2201 break;
2202 }
2203 session->mds_features = std::move(m->supported_features);
2204
2205 renew_caps(session);
2206 session->state = MetaSession::STATE_OPEN;
2207 if (is_unmounting())
2208 mount_cond.notify_all();
2209 else
2210 connect_mds_targets(from);
2211 signal_context_list(session->waiting_for_open);
2212 break;
2213 }
2214
2215 case CEPH_SESSION_CLOSE:
2216 _closed_mds_session(session);
2217 break;
2218
2219 case CEPH_SESSION_RENEWCAPS:
2220 if (session->cap_renew_seq == m->get_seq()) {
2221 bool was_stale = ceph_clock_now() >= session->cap_ttl;
2222 session->cap_ttl =
2223 session->last_cap_renew_request + mdsmap->get_session_timeout();
2224 if (was_stale)
2225 wake_up_session_caps(session, false);
2226 }
2227 break;
2228
2229 case CEPH_SESSION_STALE:
2230 // invalidate session caps/leases
2231 session->cap_gen++;
2232 session->cap_ttl = ceph_clock_now();
2233 session->cap_ttl -= 1;
2234 renew_caps(session);
2235 break;
2236
2237 case CEPH_SESSION_RECALL_STATE:
2238 /*
2239 * Renew the caps and flush the cap releases just before
2240 * trimming the caps, in case tick() doesn't get a chance
2241 * to run them; otherwise the client could end up blocklisted
2242 * while the MDS daemons keep trying to recall the caps again
2243 * and again.
2244 *
2245 * In most cases this will do nothing, and the new cap releases
2246 * added by the trim_caps() that follows will have their
2247 * flushing deferred to tick().
2248 */
2249 renew_and_flush_cap_releases();
2250 trim_caps(session, m->get_max_caps());
2251 break;
2252
2253 case CEPH_SESSION_FLUSHMSG:
2254 /* flush cap release */
2255 if (auto& m = session->release; m) {
2256 session->con->send_message2(std::move(m));
2257 }
2258 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2259 break;
2260
2261 case CEPH_SESSION_FORCE_RO:
2262 force_session_readonly(session);
2263 break;
2264
2265 case CEPH_SESSION_REJECT:
2266 {
2267 std::string_view error_str;
2268 auto it = m->metadata.find("error_string");
2269 if (it != m->metadata.end())
2270 error_str = it->second;
2271 else
2272 error_str = "unknown error";
2273 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
2274
2275 _closed_mds_session(session, -CEPHFS_EPERM, true);
2276 }
2277 break;
2278
2279 default:
2280 ceph_abort();
2281 }
2282 }
2283
2284 bool Client::_any_stale_sessions() const
2285 {
2286 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
2287
2288 for (const auto &p : mds_sessions) {
2289 if (p.second.state == MetaSession::STATE_STALE) {
2290 return true;
2291 }
2292 }
2293
2294 return false;
2295 }
2296
2297 void Client::_kick_stale_sessions()
2298 {
2299 ldout(cct, 1) << __func__ << dendl;
2300
2301 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
2302 MetaSession &s = it->second;
2303 if (s.state == MetaSession::STATE_REJECTED) {
2304 mds_sessions.erase(it++);
2305 continue;
2306 }
2307 ++it;
2308 if (s.state == MetaSession::STATE_STALE)
2309 _closed_mds_session(&s);
2310 }
2311 }
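/*
 * The loop above erases map entries while iterating, which is only
 * safe if the iterator is advanced before (or while) erasing. Sketch
 * of the safe idioms on a plain std::map (illustrative only):
 *
 *   #include <map>
 *
 *   void prune(std::map<int, int>& m)
 *   {
 *     for (auto it = m.begin(); it != m.end(); ) {
 *       if (it->second < 0)
 *         m.erase(it++);   // post-increment: erase the old position
 *       else
 *         ++it;
 *     }
 *     // equivalently, C++11: it = m.erase(it); on the erase branch
 *   }
 */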
2312
2313 void Client::send_request(MetaRequest *request, MetaSession *session,
2314 bool drop_cap_releases)
2315 {
2316 // make the request
2317 mds_rank_t mds = session->mds_num;
2318 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
2319 << " for mds." << mds << dendl;
2320 auto r = build_client_request(request);
2321 if (request->dentry()) {
2322 r->set_dentry_wanted();
2323 }
2324 if (request->got_unsafe) {
2325 r->set_replayed_op();
2326 if (request->target)
2327 r->head.ino = request->target->ino;
2328 } else {
2329 encode_cap_releases(request, mds);
2330 if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
2331 request->cap_releases.clear();
2332 else
2333 r->releases.swap(request->cap_releases);
2334 }
2335 r->set_mdsmap_epoch(mdsmap->get_epoch());
2336 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2337 objecter->with_osdmap([r](const OSDMap& o) {
2338 r->set_osdmap_epoch(o.get_epoch());
2339 });
2340 }
2341
2342 if (request->mds == -1) {
2343 request->sent_stamp = ceph_clock_now();
2344 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
2345 }
2346 request->mds = mds;
2347
2348 Inode *in = request->inode();
2349 if (in) {
2350 auto it = in->caps.find(mds);
2351 if (it != in->caps.end()) {
2352 request->sent_on_mseq = it->second.mseq;
2353 }
2354 }
2355
2356 session->requests.push_back(&request->item);
2357
2358 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2359 session->con->send_message2(std::move(r));
2360 }
2361
2362 ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
2363 {
2364 auto req = make_message<MClientRequest>(request->get_op());
2365 req->set_tid(request->tid);
2366 req->set_stamp(request->op_stamp);
2367 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2368
2369 // if the filepaths haven't been set, set them!
2370 if (request->path.empty()) {
2371 Inode *in = request->inode();
2372 Dentry *de = request->dentry();
2373 if (in)
2374 in->make_nosnap_relative_path(request->path);
2375 else if (de) {
2376 if (de->inode)
2377 de->inode->make_nosnap_relative_path(request->path);
2378 else if (de->dir) {
2379 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2380 request->path.push_dentry(de->name);
2381 }
2382 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2383 << " No path, inode, or appropriately-endowed dentry given!"
2384 << dendl;
2385 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2386 << " No path, inode, or dentry given!"
2387 << dendl;
2388 }
2389 req->set_filepath(request->get_filepath());
2390 req->set_filepath2(request->get_filepath2());
2391 req->set_alternate_name(request->alternate_name);
2392 req->set_data(request->data);
2393 req->set_retry_attempt(request->retry_attempt++);
2394 req->head.num_fwd = request->num_fwd;
2395 const gid_t *_gids;
2396 int gid_count = request->perms.get_gids(&_gids);
2397 req->set_gid_list(gid_count, _gids);
2398 return req;
2399 }
2400
2401
2402
2403 void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
2404 {
2405 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2406
2407 std::scoped_lock cl(client_lock);
2408 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2409 if (!session) {
2410 return;
2411 }
2412 ceph_tid_t tid = fwd->get_tid();
2413
2414 if (mds_requests.count(tid) == 0) {
2415 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
2416 return;
2417 }
2418
2419 MetaRequest *request = mds_requests[tid];
2420 ceph_assert(request);
2421
2422 // reset retry counter
2423 request->retry_attempt = 0;
2424
2425 // request not forwarded, or dest mds has no session.
2426 // resend.
2427 ldout(cct, 10) << __func__ << " tid " << tid
2428 << " fwd " << fwd->get_num_fwd()
2429 << " to mds." << fwd->get_dest_mds()
2430 << ", resending to " << fwd->get_dest_mds()
2431 << dendl;
2432
2433 request->mds = -1;
2434 request->item.remove_myself();
2435 request->num_fwd = fwd->get_num_fwd();
2436 request->resend_mds = fwd->get_dest_mds();
2437 request->caller_cond->notify_all();
2438 }
2439
2440 bool Client::is_dir_operation(MetaRequest *req)
2441 {
2442 int op = req->get_op();
2443 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2444 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2445 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2446 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2447 return true;
2448 return false;
2449 }
2450
2451 void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
2452 {
2453 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2454
2455 std::scoped_lock cl(client_lock);
2456 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2457 if (!session) {
2458 return;
2459 }
2460
2461 ceph_tid_t tid = reply->get_tid();
2462 bool is_safe = reply->is_safe();
2463
2464 if (mds_requests.count(tid) == 0) {
2465 lderr(cct) << __func__ << " no pending request on tid " << tid
2466 << " safe is:" << is_safe << dendl;
2467 return;
2468 }
2469 MetaRequest *request = mds_requests.at(tid);
2470
2471 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
2472 << " tid " << tid << dendl;
2473
2474 if (request->got_unsafe && !is_safe) {
2475 //duplicate response
2476 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2477 << mds_num << " safe:" << is_safe << dendl;
2478 return;
2479 }
2480
2481 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
2482 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2483 << " from mds." << request->mds << dendl;
2484 request->send_to_auth = true;
2485 request->resend_mds = choose_target_mds(request);
2486 Inode *in = request->inode();
2487 std::map<mds_rank_t, Cap>::const_iterator it;
2488 if (request->resend_mds >= 0 &&
2489 request->resend_mds == request->mds &&
2490 (in == NULL ||
2491 (it = in->caps.find(request->resend_mds)) == in->caps.end() ||
2492 request->sent_on_mseq == it->second.mseq)) {
2493 ldout(cct, 20) << "have to return ESTALE" << dendl;
2494 } else {
2495 request->caller_cond->notify_all();
2496 return;
2497 }
2498 }
2499
2500 ceph_assert(!request->reply);
2501 request->reply = reply;
2502 insert_trace(request, session);
2503
2504 // Handle unsafe reply
2505 if (!is_safe) {
2506 request->got_unsafe = true;
2507 session->unsafe_requests.push_back(&request->unsafe_item);
2508 if (is_dir_operation(request)) {
2509 Inode *dir = request->inode();
2510 ceph_assert(dir);
2511 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2512 }
2513 if (request->target) {
2514 InodeRef &in = request->target;
2515 in->unsafe_ops.push_back(&request->unsafe_target_item);
2516 }
2517 }
2518
2519 // Only signal the caller once (on the first reply):
2520 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2521 if (!is_safe || !request->got_unsafe) {
2522 ceph::condition_variable cond;
2523 request->dispatch_cond = &cond;
2524
2525 // wake up waiter
2526 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
2527 request->caller_cond->notify_all();
2528
2529 // wake for kick back
2530 std::unique_lock l{client_lock, std::adopt_lock};
2531 cond.wait(l, [tid, request, &cond, this] {
2532 if (request->dispatch_cond) {
2533 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2534 << tid << " " << &cond << dendl;
2535 }
2536 return !request->dispatch_cond;
2537 });
2538 l.release();
2539 }
2540
2541 if (is_safe) {
2542 // the filesystem change is committed to disk
2543 // we're done, clean up
2544 if (request->got_unsafe) {
2545 request->unsafe_item.remove_myself();
2546 request->unsafe_dir_item.remove_myself();
2547 request->unsafe_target_item.remove_myself();
2548 signal_cond_list(request->waitfor_safe);
2549 }
2550 request->item.remove_myself();
2551 unregister_request(request);
2552 }
2553 if (is_unmounting())
2554 mount_cond.notify_all();
2555 }
2556
2557 void Client::_handle_full_flag(int64_t pool)
2558 {
2559 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2560 << "on " << pool << dendl;
2561 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2562 // to do this rather than blocking, because otherwise when we fill up we
2563 // potentially lock caps forever on files with dirty pages, and we need
2564 // to be able to release those caps to the MDS so that it can delete files
2565 // and free up space.
2566 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
2567
2568 // For all inodes with layouts in this pool and a pending flush write op
2569 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2570 // from ObjectCacher so that it doesn't re-issue the write in response to
2571 // the ENOSPC error.
2572 // Fortunately since we're cancelling everything in a given pool, we don't
2573 // need to know which ops belong to which ObjectSet, we can just blow all
2574 // the un-flushed cached data away and mark any dirty inodes' async_err
2575 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2576 // affecting this pool, and all the objectsets we're purging were also
2577 // in this pool.
2578 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2579 i != inode_map.end(); ++i)
2580 {
2581 Inode *inode = i->second;
2582 if (inode->oset.dirty_or_tx
2583 && (pool == -1 || inode->layout.pool_id == pool)) {
2584 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2585 << " has dirty objects, purging and setting ENOSPC" << dendl;
2586 objectcacher->purge_set(&inode->oset);
2587 inode->set_async_err(-CEPHFS_ENOSPC);
2588 }
2589 }
2590
2591 if (cancelled_epoch != (epoch_t)-1) {
2592 set_cap_epoch_barrier(cancelled_epoch);
2593 }
2594 }
2595
2596 void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
2597 {
2598 std::set<entity_addr_t> new_blocklists;
2599
2600 std::scoped_lock cl(client_lock);
2601 objecter->consume_blocklist_events(&new_blocklists);
2602
2603 const auto myaddrs = messenger->get_myaddrs();
2604 bool new_blocklist = false;
2605 bool prenautilus = objecter->with_osdmap(
2606 [&](const OSDMap& o) {
2607 return o.require_osd_release < ceph_release_t::nautilus;
2608 });
2609 if (!blocklisted) {
2610 for (auto a : myaddrs.v) {
2611 // blocklist entries are always TYPE_ANY for nautilus+
2612 a.set_type(entity_addr_t::TYPE_ANY);
2613 if (new_blocklists.count(a)) {
2614 new_blocklist = true;
2615 break;
2616 }
2617 if (prenautilus) {
2618 // ...except pre-nautilus, they were TYPE_LEGACY
2619 a.set_type(entity_addr_t::TYPE_LEGACY);
2620 if (new_blocklists.count(a)) {
2621 new_blocklist = true;
2622 break;
2623 }
2624 }
2625 }
2626 }
2627 if (new_blocklist) {
2628 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2629 return o.get_epoch();
2630 });
2631 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2632 blocklisted = true;
2633
2634 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
2635
2636 // Since we know all our OSD ops will fail, cancel them all preemptively,
2637 // so that on an unhealthy cluster we can umount promptly even if e.g.
2638 // some PGs were inaccessible.
2639 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2640
2641 }
2642
2643 if (blocklisted) {
2644 // Handle case where we were blocklisted but no longer are
2645 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2646 return o.is_blocklisted(myaddrs);});
2647 }
2648
2649 // While blocklisted, keep subscribing to the next osdmap
2650 // until this client is no longer blocklisted.
2651 if (blocklisted) {
2652 objecter->maybe_request_map();
2653 }
2654
2655 if (objecter->osdmap_full_flag()) {
2656 _handle_full_flag(-1);
2657 } else {
2658 // Accumulate local list of full pools so that I can drop
2659 // the objecter lock before re-entering objecter in
2660 // cancel_writes
2661 std::vector<int64_t> full_pools;
2662
2663 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2664 for (const auto& kv : o.get_pools()) {
2665 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2666 full_pools.push_back(kv.first);
2667 }
2668 }
2669 });
2670
2671 for (auto p : full_pools)
2672 _handle_full_flag(p);
2673
2674 // Subscribe to subsequent maps to watch for the full flag going
2675 // away. For the global full flag objecter does this for us, but
2676 // it pays no attention to the per-pool full flag so in this branch
2677 // we do it ourselves.
2678 if (!full_pools.empty()) {
2679 objecter->maybe_request_map();
2680 }
2681 }
2682 }
2683
2684
2685 // ------------------------
2686 // incoming messages
2687
2688
2689 bool Client::ms_dispatch2(const MessageRef &m)
2690 {
2691 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2692 if (!iref_reader.is_state_satisfied()) {
2693 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2694 return true;
2695 }
2696
2697 switch (m->get_type()) {
2698 // mounting and mds sessions
2699 case CEPH_MSG_MDS_MAP:
2700 handle_mds_map(ref_cast<MMDSMap>(m));
2701 break;
2702 case CEPH_MSG_FS_MAP:
2703 handle_fs_map(ref_cast<MFSMap>(m));
2704 break;
2705 case CEPH_MSG_FS_MAP_USER:
2706 handle_fs_map_user(ref_cast<MFSMapUser>(m));
2707 break;
2708 case CEPH_MSG_CLIENT_SESSION:
2709 handle_client_session(ref_cast<MClientSession>(m));
2710 break;
2711
2712 case CEPH_MSG_OSD_MAP:
2713 handle_osd_map(ref_cast<MOSDMap>(m));
2714 break;
2715
2716 // requests
2717 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2718 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
2719 break;
2720 case CEPH_MSG_CLIENT_REPLY:
2721 handle_client_reply(ref_cast<MClientReply>(m));
2722 break;
2723
2724 // reclaim reply
2725 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
2726 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
2727 break;
2728
2729 case CEPH_MSG_CLIENT_SNAP:
2730 handle_snap(ref_cast<MClientSnap>(m));
2731 break;
2732 case CEPH_MSG_CLIENT_CAPS:
2733 handle_caps(ref_cast<MClientCaps>(m));
2734 break;
2735 case CEPH_MSG_CLIENT_LEASE:
2736 handle_lease(ref_cast<MClientLease>(m));
2737 break;
2738 case MSG_COMMAND_REPLY:
2739 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2740 handle_command_reply(ref_cast<MCommandReply>(m));
2741 } else {
2742 return false;
2743 }
2744 break;
2745 case CEPH_MSG_CLIENT_QUOTA:
2746 handle_quota(ref_cast<MClientQuota>(m));
2747 break;
2748
2749 default:
2750 return false;
2751 }
2752
2753 // unmounting?
2754 std::scoped_lock cl(client_lock);
2755 if (is_unmounting()) {
2756 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2757 << "+" << inode_map.size() << dendl;
2758 uint64_t size = lru.lru_get_size() + inode_map.size();
2759 trim_cache();
2760 if (size > lru.lru_get_size() + inode_map.size()) {
2761 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2762 mount_cond.notify_all();
2763 } else {
2764 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2765 << "+" << inode_map.size() << dendl;
2766 }
2767 }
2768
2769 return true;
2770 }
2771
2772 void Client::handle_fs_map(const MConstRef<MFSMap>& m)
2773 {
2774 std::scoped_lock cl(client_lock);
2775 fsmap.reset(new FSMap(m->get_fsmap()));
2776
2777 signal_cond_list(waiting_for_fsmap);
2778
2779 monclient->sub_got("fsmap", fsmap->get_epoch());
2780 }
2781
2782 void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
2783 {
2784 std::scoped_lock cl(client_lock);
2785 fsmap_user.reset(new FSMapUser);
2786 *fsmap_user = m->get_fsmap();
2787
2788 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2789 signal_cond_list(waiting_for_fsmap);
2790 }
2791
2792 // Cancel all the commands for missing or laggy GIDs
2793 void Client::cancel_commands(const MDSMap& newmap)
2794 {
2795 std::vector<ceph_tid_t> cancel_ops;
2796
2797 std::scoped_lock cmd_lock(command_lock);
2798 auto &commands = command_table.get_commands();
2799 for (const auto &[tid, op] : commands) {
2800 const mds_gid_t op_mds_gid = op.mds_gid;
2801 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2802 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2803 cancel_ops.push_back(tid);
2804 if (op.outs) {
2805 std::ostringstream ss;
2806 ss << "MDS " << op_mds_gid << " went away";
2807 *(op.outs) = ss.str();
2808 }
2809 /*
2810 * No need to call con->mark_down() under
2811 * client_lock here, because the con
2812 * has its own lock.
2813 */
2814 op.con->mark_down();
2815 if (op.on_finish)
2816 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
2817 }
2818 }
2819
2820 for (const auto &tid : cancel_ops)
2821 command_table.erase(tid);
2822 }
2823
2824 void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2825 {
2826 std::unique_lock cl(client_lock);
2827 if (m->get_epoch() <= mdsmap->get_epoch()) {
2828 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
2829 << " is identical to or older than our "
2830 << mdsmap->get_epoch() << dendl;
2831 return;
2832 }
2833
2834 cl.unlock();
2835 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
2836 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
2837 _mdsmap->decode(m->get_encoded());
2838 cancel_commands(*_mdsmap.get());
2839 cl.lock();
2840
2841 _mdsmap.swap(mdsmap);
2842
2843 // reset session
2844 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
2845 mds_rank_t mds = p->first;
2846 MetaSession *session = &p->second;
2847 ++p;
2848
2849 int oldstate = _mdsmap->get_state(mds);
2850 int newstate = mdsmap->get_state(mds);
2851 if (!mdsmap->is_up(mds)) {
2852 session->con->mark_down();
2853 } else if (mdsmap->get_addrs(mds) != session->addrs) {
2854 auto old_inc = _mdsmap->get_incarnation(mds);
2855 auto new_inc = mdsmap->get_incarnation(mds);
2856 if (old_inc != new_inc) {
2857 ldout(cct, 1) << "mds incarnation changed from "
2858 << old_inc << " to " << new_inc << dendl;
2859 oldstate = MDSMap::STATE_NULL;
2860 }
2861 session->con->mark_down();
2862 session->addrs = mdsmap->get_addrs(mds);
2863 // When new MDS starts to take over, notify kernel to trim unused entries
2864 // in its dcache/icache. Hopefully, the kernel will release some unused
2865 // inodes before the new MDS enters reconnect state.
2866 trim_cache_for_reconnect(session);
2867 } else if (oldstate == newstate)
2868 continue; // no change
2869
2870 session->mds_state = newstate;
2871 if (newstate == MDSMap::STATE_RECONNECT) {
2872 session->con = messenger->connect_to_mds(session->addrs);
2873 send_reconnect(session);
2874 } else if (newstate > MDSMap::STATE_RECONNECT) {
2875 if (oldstate < MDSMap::STATE_RECONNECT) {
2876 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
2877 _closed_mds_session(session);
2878 continue;
2879 }
2880 if (newstate >= MDSMap::STATE_ACTIVE) {
2881 if (oldstate < MDSMap::STATE_ACTIVE) {
2882 // kick new requests
2883 kick_requests(session);
2884 kick_flushing_caps(session);
2885 signal_context_list(session->waiting_for_open);
2886 wake_up_session_caps(session, true);
2887 }
2888 connect_mds_targets(mds);
2889 }
2890 } else if (newstate == MDSMap::STATE_NULL &&
2891 mds >= mdsmap->get_max_mds()) {
2892 _closed_mds_session(session);
2893 }
2894 }
2895
2896 // kick any waiting threads
2897 signal_cond_list(waiting_for_mdsmap);
2898
2899 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2900 }
2901
2902 void Client::send_reconnect(MetaSession *session)
2903 {
2904 mds_rank_t mds = session->mds_num;
2905 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
2906
2907 // trim unused caps to reduce MDS's cache rejoin time
2908 trim_cache_for_reconnect(session);
2909
2910 session->readonly = false;
2911
2912 session->release.reset();
2913
2914 // reset my cap seq number
2915 session->seq = 0;
2916 // connect to the mds' offload targets
2917 connect_mds_targets(mds);
2918 // make sure unsafe requests get saved
2919 resend_unsafe_requests(session);
2920
2921 early_kick_flushing_caps(session);
2922
2923 auto m = make_message<MClientReconnect>();
2924 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
2925
2926 // i have an open session.
2927 ceph::unordered_set<inodeno_t> did_snaprealm;
2928 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2929 p != inode_map.end();
2930 ++p) {
2931 Inode *in = p->second;
2932 auto it = in->caps.find(mds);
2933 if (it != in->caps.end()) {
2934 if (allow_multi &&
2935 m->get_approx_size() >=
2936 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
2937 m->mark_more();
2938 session->con->send_message2(std::move(m));
2939
2940 m = make_message<MClientReconnect>();
2941 }
2942
2943 Cap &cap = it->second;
2944 ldout(cct, 10) << " caps on " << p->first
2945 << " " << ccap_string(cap.issued)
2946 << " wants " << ccap_string(in->caps_wanted())
2947 << dendl;
2948 filepath path;
2949 in->make_short_path(path);
2950 ldout(cct, 10) << " path " << path << dendl;
2951
2952 bufferlist flockbl;
2953 _encode_filelocks(in, flockbl);
2954
2955 cap.seq = 0; // reset seq.
2956 cap.issue_seq = 0; // reset seq.
2957 cap.mseq = 0; // reset seq.
2958 // cap gen should catch up with session cap_gen
2959 if (cap.gen < session->cap_gen) {
2960 cap.gen = session->cap_gen;
2961 cap.issued = cap.implemented = CEPH_CAP_PIN;
2962 } else {
2963 cap.issued = cap.implemented;
2964 }
2965 snapid_t snap_follows = 0;
2966 if (!in->cap_snaps.empty())
2967 snap_follows = in->cap_snaps.begin()->first;
2968
2969 m->add_cap(p->first.ino,
2970 cap.cap_id,
2971 path.get_ino(), path.get_path(), // ino
2972 in->caps_wanted(), // wanted
2973 cap.issued, // issued
2974 in->snaprealm->ino,
2975 snap_follows,
2976 flockbl);
2977
2978 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2979 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2980 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2981 did_snaprealm.insert(in->snaprealm->ino);
2982 }
2983 }
2984 }
2985
2986 if (!allow_multi)
2987 m->set_encoding_version(0); // use connection features to choose encoding
2988 session->con->send_message2(std::move(m));
2989
2990 mount_cond.notify_all();
2991
2992 if (session->reclaim_state == MetaSession::RECLAIMING)
2993 signal_cond_list(waiting_for_reclaim);
2994 }
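/*
 * When the peer supports multi-part reconnects, the loop above
 * flushes a partially filled MClientReconnect once it approaches a
 * size cap and then starts a new one. A sketch of that batching
 * pattern with a hypothetical sender (names illustrative only):
 *
 *   #include <cstddef>
 *   #include <string>
 *   #include <vector>
 *
 *   void send_batched(const std::vector<std::string>& items,
 *                     std::size_t max_batch_bytes,
 *                     void (*send)(std::vector<std::string>&&))
 *   {
 *     std::vector<std::string> batch;
 *     std::size_t bytes = 0;
 *     for (const auto& i : items) {
 *       if (!batch.empty() && bytes + i.size() > max_batch_bytes) {
 *         send(std::move(batch));  // flush a full "more to come" part
 *         batch.clear();
 *         bytes = 0;
 *       }
 *       batch.push_back(i);
 *       bytes += i.size();
 *     }
 *     send(std::move(batch));      // final (possibly only) part
 *   }
 */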
2995
2996
2997 void Client::kick_requests(MetaSession *session)
2998 {
2999 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3000 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3001 p != mds_requests.end();
3002 ++p) {
3003 MetaRequest *req = p->second;
3004 if (req->got_unsafe)
3005 continue;
3006 if (req->aborted()) {
3007 if (req->caller_cond) {
3008 req->kick = true;
3009 req->caller_cond->notify_all();
3010 }
3011 continue;
3012 }
3013 if (req->retry_attempt > 0)
3014 continue; // new requests only
3015 if (req->mds == session->mds_num) {
3016 send_request(p->second, session);
3017 }
3018 }
3019 }
3020
3021 void Client::resend_unsafe_requests(MetaSession *session)
3022 {
3023 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3024 !iter.end();
3025 ++iter)
3026 send_request(*iter, session);
3027
3028 // Also re-send old requests when the MDS enters the reconnect stage, so
3029 // that the MDS can process completed requests in the clientreplay stage.
3030 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3031 p != mds_requests.end();
3032 ++p) {
3033 MetaRequest *req = p->second;
3034 if (req->got_unsafe)
3035 continue;
3036 if (req->aborted())
3037 continue;
3038 if (req->retry_attempt == 0)
3039 continue; // old requests only
3040 if (req->mds == session->mds_num)
3041 send_request(req, session, true);
3042 }
3043 }
3044
3045 void Client::wait_unsafe_requests()
3046 {
3047 list<MetaRequest*> last_unsafe_reqs;
3048 for (const auto &p : mds_sessions) {
3049 const MetaSession &s = p.second;
3050 if (!s.unsafe_requests.empty()) {
3051 MetaRequest *req = s.unsafe_requests.back();
3052 req->get();
3053 last_unsafe_reqs.push_back(req);
3054 }
3055 }
3056
3057 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3058 p != last_unsafe_reqs.end();
3059 ++p) {
3060 MetaRequest *req = *p;
3061 if (req->unsafe_item.is_on_list())
3062 wait_on_list(req->waitfor_safe);
3063 put_request(req);
3064 }
3065 }
3066
3067 void Client::kick_requests_closed(MetaSession *session)
3068 {
3069 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
3070 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3071 p != mds_requests.end(); ) {
3072 MetaRequest *req = p->second;
3073 ++p;
3074 if (req->mds == session->mds_num) {
3075 if (req->caller_cond) {
3076 req->kick = true;
3077 req->caller_cond->notify_all();
3078 }
3079 req->item.remove_myself();
3080 if (req->got_unsafe) {
3081 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
3082 req->unsafe_item.remove_myself();
3083 if (is_dir_operation(req)) {
3084 Inode *dir = req->inode();
3085 ceph_assert(dir);
3086 dir->set_async_err(-CEPHFS_EIO);
3087 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3088 << dir->ino << " " << req->get_tid() << dendl;
3089 req->unsafe_dir_item.remove_myself();
3090 }
3091 if (req->target) {
3092 InodeRef &in = req->target;
3093 in->set_async_err(-CEPHFS_EIO);
3094 lderr(cct) << "kick_requests_closed drop req of inode : "
3095 << in->ino << " " << req->get_tid() << dendl;
3096 req->unsafe_target_item.remove_myself();
3097 }
3098 signal_cond_list(req->waitfor_safe);
3099 unregister_request(req);
3100 }
3101 }
3102 }
3103 ceph_assert(session->requests.empty());
3104 ceph_assert(session->unsafe_requests.empty());
3105 }
3106
3107
3108
3109
3110 /************
3111 * leases
3112 */
3113
3114 void Client::got_mds_push(MetaSession *s)
3115 {
3116 s->seq++;
3117 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3118 if (s->state == MetaSession::STATE_CLOSING) {
3119 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
3120 }
3121 }
3122
3123 void Client::handle_lease(const MConstRef<MClientLease>& m)
3124 {
3125 ldout(cct, 10) << __func__ << " " << *m << dendl;
3126
3127 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
3128 mds_rank_t mds = mds_rank_t(m->get_source().num());
3129
3130 std::scoped_lock cl(client_lock);
3131 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
3132 if (!session) {
3133 return;
3134 }
3135
3136 got_mds_push(session);
3137
3138 ceph_seq_t seq = m->get_seq();
3139
3140 Inode *in;
3141 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3142 if (inode_map.count(vino) == 0) {
3143 ldout(cct, 10) << " don't have vino " << vino << dendl;
3144 goto revoke;
3145 }
3146 in = inode_map[vino];
3147
3148 if (m->get_mask() & CEPH_LEASE_VALID) {
3149 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3150 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3151 goto revoke;
3152 }
3153 Dentry *dn = in->dir->dentries[m->dname];
3154 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3155 dn->lease_mds = -1;
3156 }
3157
3158 revoke:
3159 {
3160 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3161 m->get_mask(), m->get_ino(),
3162 m->get_first(), m->get_last(), m->dname);
3163 m->get_connection()->send_message2(std::move(reply));
3164 }
3165 }
3166
3167 void Client::_put_inode(Inode *in, int n)
3168 {
3169 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3170
3171 int left = in->_put(n);
3172 if (left == 0) {
3173 // release any caps
3174 remove_all_caps(in);
3175
3176 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
3177 bool unclean = objectcacher->release_set(&in->oset);
3178 ceph_assert(!unclean);
3179 inode_map.erase(in->vino());
3180 if (use_faked_inos())
3181 _release_faked_ino(in);
3182
3183 if (in == root) {
3184 root = 0;
3185 root_ancestor = 0;
3186 while (!root_parents.empty())
3187 root_parents.erase(root_parents.begin());
3188 }
3189
3190 delete in;
3191 }
3192 }
3193
3194 void Client::delay_put_inodes(bool wakeup)
3195 {
3196 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3197
3198 std::map<Inode*,int> release;
3199 {
3200 std::scoped_lock dl(delay_i_lock);
3201 release.swap(delay_i_release);
3202 }
3203
3204 if (release.empty())
3205 return;
3206
3207 for (auto &[in, cnt] : release)
3208 _put_inode(in, cnt);
3209
3210 if (wakeup)
3211 mount_cond.notify_all();
3212 }
3213
3214 void Client::put_inode(Inode *in, int n)
3215 {
3216 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3217
3218 std::scoped_lock dl(delay_i_lock);
3219 delay_i_release[in] += n;
3220 }
3221
3222 void Client::close_dir(Dir *dir)
3223 {
3224 Inode *in = dir->parent_inode;
3225 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3226 ceph_assert(dir->is_empty());
3227 ceph_assert(in->dir == dir);
3228 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3229 if (!in->dentries.empty())
3230 in->get_first_parent()->put(); // unpin dentry
3231
3232 delete in->dir;
3233 in->dir = 0;
3234 put_inode(in); // unpin inode
3235 }
3236
3237 /**
3238 * Don't call this with in==NULL; use get_or_create for that.
3239 * Leave dn set to the default NULL unless you're trying to add
3240 * a new inode to a pre-created Dentry.
3241 */
3242 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3243 {
3244 if (!dn) {
3245 // create a new Dentry
3246 dn = new Dentry(dir, name);
3247
3248 lru.lru_insert_mid(dn); // mid or top?
3249
3250 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3251 << " dn " << dn << " (new dn)" << dendl;
3252 } else {
3253 ceph_assert(!dn->inode);
3254 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3255 << " dn " << dn << " (old dn)" << dendl;
3256 }
3257
3258 if (in) { // link to inode
3259 InodeRef tmp_ref;
3260 // only one parent for directories!
3261 if (in->is_dir() && !in->dentries.empty()) {
3262 tmp_ref = in; // prevent unlink below from freeing the inode.
3263 Dentry *olddn = in->get_first_parent();
3264 ceph_assert(olddn->dir != dir || olddn->name != name);
3265 Inode *old_diri = olddn->dir->parent_inode;
3266 clear_dir_complete_and_ordered(old_diri, true);
3267 unlink(olddn, true, true); // keep dir, dentry
3268 }
3269
3270 dn->link(in);
3271 inc_dentry_nr();
3272 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
3273 }
3274
3275 return dn;
3276 }
3277
3278 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3279 {
3280 InodeRef in(dn->inode);
3281 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3282 << " inode " << dn->inode << dendl;
3283
3284 // unlink from inode
3285 if (dn->inode) {
3286 dn->unlink();
3287 dec_dentry_nr();
3288 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
3289 }
3290
3291 if (keepdentry) {
3292 dn->lease_mds = -1;
3293 } else {
3294 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3295
3296 // unlink from dir
3297 Dir *dir = dn->dir;
3298 dn->detach();
3299
3300 // delete den
3301 lru.lru_remove(dn);
3302 dn->put();
3303
3304 if (dir->is_empty() && !keepdir)
3305 close_dir(dir);
3306 }
3307 }
3308
3309 /**
3310 * For asynchronous flushes, check for errors from the IO and
3311 * update the inode if necessary
3312 */
3313 class C_Client_FlushComplete : public Context {
3314 private:
3315 Client *client;
3316 InodeRef inode;
3317 public:
3318 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3319 void finish(int r) override {
3320 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3321 if (r != 0) {
3322 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3323 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3324 << " 0x" << std::hex << inode->ino << std::dec
3325 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3326 inode->set_async_err(r);
3327 }
3328 }
3329 };
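/*
 * C_Client_FlushComplete follows the usual Context completion shape:
 * the IO path calls complete(r) exactly once, which runs finish(r)
 * with the operation's result and then frees the callback. A hedged,
 * simplified sketch of that contract (the real ceph Context carries
 * more machinery than this):
 *
 *   struct MiniContext {
 *     virtual ~MiniContext() = default;
 *     virtual void finish(int r) = 0;
 *     void complete(int r) { finish(r); delete this; }  // one-shot
 *   };
 *
 *   struct RecordError : MiniContext {
 *     int *err;
 *     explicit RecordError(int *e) : err(e) {}
 *     void finish(int r) override {
 *       if (r != 0)
 *         *err = r;   // surface the async error, as set_async_err does
 *     }
 *   };
 *
 *   // usage: hand `new RecordError(&async_err)` to the IO subsystem,
 *   // which calls complete(r) when the flush finishes.
 */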
3330
3331
3332 /****
3333 * caps
3334 */
3335
3336 void Client::get_cap_ref(Inode *in, int cap)
3337 {
3338 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3339 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3340 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3341 in->get();
3342 }
3343 if ((cap & CEPH_CAP_FILE_CACHE) &&
3344 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3345 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
3346 in->get();
3347 }
3348 in->get_cap_ref(cap);
3349 }
3350
3351 void Client::put_cap_ref(Inode *in, int cap)
3352 {
3353 int last = in->put_cap_ref(cap);
3354 if (last) {
3355 int put_nref = 0;
3356 int drop = last & ~in->caps_issued();
3357 if (in->snapid == CEPH_NOSNAP) {
3358 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
3359 !in->cap_snaps.empty() &&
3360 in->cap_snaps.rbegin()->second.writing) {
3361 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3362 in->cap_snaps.rbegin()->second.writing = 0;
3363 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3364 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3365 }
3366 if (last & CEPH_CAP_FILE_BUFFER) {
3367 for (auto &p : in->cap_snaps)
3368 p.second.dirty_data = 0;
3369 signal_cond_list(in->waitfor_commit);
3370 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3371 ++put_nref;
3372 }
3373 }
3374 if (last & CEPH_CAP_FILE_CACHE) {
3375 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
3376 ++put_nref;
3377 }
3378 if (drop)
3379 check_caps(in, 0);
3380 if (put_nref)
3381 put_inode(in, put_nref);
3382 }
3383 }
3384
3385 // Get caps for a given file handle -- the inode should have @need caps
3386 // issued by the MDS and @want caps not revoked (or not under revocation).
3387 // This routine blocks until the cap requirement is satisfied, and also
3388 // accounts (tracks) a capability hit when the cap requirement succeeds.
3389 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3390 {
3391 Inode *in = fh->inode.get();
3392
3393 int r = check_pool_perm(in, need);
3394 if (r < 0)
3395 return r;
3396
3397 while (1) {
3398 int file_wanted = in->caps_file_wanted();
3399 if ((file_wanted & need) != need) {
3400 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3401 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3402 << dendl;
3403 return -CEPHFS_EBADF;
3404 }
3405
3406 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3407 return -CEPHFS_EBADF;
3408
3409 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3410 return -CEPHFS_EIO;
3411
3412 int implemented;
3413 int have = in->caps_issued(&implemented);
3414
3415 bool waitfor_caps = false;
3416 bool waitfor_commit = false;
3417
3418 if (have & need & CEPH_CAP_FILE_WR) {
3419 if (endoff > 0) {
3420 if ((endoff >= (loff_t)in->max_size ||
3421 endoff > (loff_t)(in->size << 1)) &&
3422 endoff > (loff_t)in->wanted_max_size) {
3423 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3424 in->wanted_max_size = endoff;
3425 }
3426 if (in->wanted_max_size > in->max_size &&
3427 in->wanted_max_size > in->requested_max_size)
3428 check_caps(in, 0);
3429 }
3430
3431 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3432 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3433 waitfor_caps = true;
3434 }
3435 if (!in->cap_snaps.empty()) {
3436 if (in->cap_snaps.rbegin()->second.writing) {
3437 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3438 waitfor_caps = true;
3439 }
3440 for (auto &p : in->cap_snaps) {
3441 if (p.second.dirty_data) {
3442 waitfor_commit = true;
3443 break;
3444 }
3445 }
3446 if (waitfor_commit) {
3447 _flush(in, new C_Client_FlushComplete(this, in));
3448 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3449 }
3450 }
3451 }
3452
3453 if (!waitfor_caps && !waitfor_commit) {
3454 if ((have & need) == need) {
3455 int revoking = implemented & ~have;
3456 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3457 << " need " << ccap_string(need) << " want " << ccap_string(want)
3458 << " revoking " << ccap_string(revoking)
3459 << dendl;
3460 if ((revoking & want) == 0) {
3461 *phave = need | (have & want);
3462 in->get_cap_ref(need);
3463 cap_hit();
3464 return 0;
3465 }
3466 }
3467 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3468 waitfor_caps = true;
3469 }
3470
3471 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3472 in->auth_cap->session->readonly)
3473 return -CEPHFS_EROFS;
3474
3475 if (in->flags & I_CAP_DROPPED) {
3476 int mds_wanted = in->caps_mds_wanted();
3477 if ((mds_wanted & need) != need) {
3478 int ret = _renew_caps(in);
3479 if (ret < 0)
3480 return ret;
3481 continue;
3482 }
3483 if (!(file_wanted & ~mds_wanted))
3484 in->flags &= ~I_CAP_DROPPED;
3485 }
3486
3487 if (waitfor_caps)
3488 wait_on_list(in->waitfor_caps);
3489 else if (waitfor_commit)
3490 wait_on_list(in->waitfor_commit);
3491 }
3492 }
3493
3494 int Client::get_caps_used(Inode *in)
3495 {
3496 unsigned used = in->caps_used();
3497 if (!(used & CEPH_CAP_FILE_CACHE) &&
3498 !objectcacher->set_is_empty(&in->oset))
3499 used |= CEPH_CAP_FILE_CACHE;
3500 return used;
3501 }
3502
3503 void Client::cap_delay_requeue(Inode *in)
3504 {
3505 ldout(cct, 10) << __func__ << " on " << *in << dendl;
3506 in->hold_caps_until = ceph_clock_now();
3507 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3508 delayed_list.push_back(&in->delay_cap_item);
3509 }
3510
3511 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3512 int flags, int used, int want, int retain,
3513 int flush, ceph_tid_t flush_tid)
3514 {
3515 int held = cap->issued | cap->implemented;
3516 int revoking = cap->implemented & ~cap->issued;
3517 retain &= ~revoking;
3518 int dropping = cap->issued & ~retain;
3519 int op = CEPH_CAP_OP_UPDATE;
3520
3521 ldout(cct, 10) << __func__ << " " << *in
3522 << " mds." << session->mds_num << " seq " << cap->seq
3523 << " used " << ccap_string(used)
3524 << " want " << ccap_string(want)
3525 << " flush " << ccap_string(flush)
3526 << " retain " << ccap_string(retain)
3527 << " held "<< ccap_string(held)
3528 << " revoking " << ccap_string(revoking)
3529 << " dropping " << ccap_string(dropping)
3530 << dendl;
3531
3532 if (cct->_conf->client_inject_release_failure && revoking) {
3533 const int would_have_issued = cap->issued & retain;
3534 const int would_have_implemented = cap->implemented & (cap->issued | used);
3535 // Simulated bug:
3536 // - tell the server we think issued is whatever they issued plus whatever we implemented
3537 // - leave what we have implemented in place
3538 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3539 cap->issued = cap->issued | cap->implemented;
3540
3541 // Make an exception for revoking xattr caps: we are injecting
3542 // failure to release other caps, but allow xattr because client
3543 // will block on xattr ops if it can't release these to MDS (#9800)
3544 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3545 cap->issued ^= xattr_mask & revoking;
3546 cap->implemented ^= xattr_mask & revoking;
3547
3548 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3549 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3550 } else {
3551 // Normal behaviour
3552 cap->issued &= retain;
3553 cap->implemented &= cap->issued | used;
3554 }
3555
3556 snapid_t follows = 0;
3557
3558 if (flush)
3559 follows = in->snaprealm->get_snap_context().seq;
3560
3561 auto m = make_message<MClientCaps>(op,
3562 in->ino,
3563 0,
3564 cap->cap_id, cap->seq,
3565 cap->implemented,
3566 want,
3567 flush,
3568 cap->mseq,
3569 cap_epoch_barrier);
3570 m->caller_uid = in->cap_dirtier_uid;
3571 m->caller_gid = in->cap_dirtier_gid;
3572
3573 m->head.issue_seq = cap->issue_seq;
3574 m->set_tid(flush_tid);
3575
3576 m->head.uid = in->uid;
3577 m->head.gid = in->gid;
3578 m->head.mode = in->mode;
3579
3580 m->head.nlink = in->nlink;
3581
3582 if (flush & CEPH_CAP_XATTR_EXCL) {
3583 encode(in->xattrs, m->xattrbl);
3584 m->head.xattr_version = in->xattr_version;
3585 }
3586
3587 m->size = in->size;
3588 m->max_size = in->max_size;
3589 m->truncate_seq = in->truncate_seq;
3590 m->truncate_size = in->truncate_size;
3591 m->mtime = in->mtime;
3592 m->atime = in->atime;
3593 m->ctime = in->ctime;
3594 m->btime = in->btime;
3595 m->time_warp_seq = in->time_warp_seq;
3596 m->change_attr = in->change_attr;
3597
3598 if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3599 !in->cap_snaps.empty() &&
3600 in->cap_snaps.rbegin()->second.flush_tid == 0)
3601 flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3602 m->flags = flags;
3603
3604 if (flush & CEPH_CAP_FILE_WR) {
3605 m->inline_version = in->inline_version;
3606 m->inline_data = in->inline_data;
3607 }
3608
3609 in->reported_size = in->size;
3610 m->set_snap_follows(follows);
3611 cap->wanted = want;
3612 if (cap == in->auth_cap) {
3613 if (want & CEPH_CAP_ANY_FILE_WR) {
3614 m->set_max_size(in->wanted_max_size);
3615 in->requested_max_size = in->wanted_max_size;
3616 ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3617 } else {
3618 in->requested_max_size = 0;
3619 ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3620 }
3621 }
3622
3623 if (!session->flushing_caps_tids.empty())
3624 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3625
3626 session->con->send_message2(std::move(m));
3627 }
3628
3629 static bool is_max_size_approaching(Inode *in)
3630 {
3631 /* mds will adjust max size according to the reported size */
3632 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3633 return false;
3634 if (in->size >= in->max_size)
3635 return true;
3636 /* half of previous max_size increment has been used */
3637 if (in->max_size > in->reported_size &&
3638 (in->size << 1) >= in->max_size + in->reported_size)
3639 return true;
3640 return false;
3641 }
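//
// Worked example of the threshold above (illustrative numbers): with
// reported_size = 100 and max_size = 200, the MDS granted a 100-byte
// increment. Once size reaches 150, (size << 1) = 300 >= max_size +
// reported_size = 300, i.e. half of the increment has been consumed and
// we ask for more room early, before writers stall at max_size.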
3642
3643 static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3644 {
3645 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3646 return used;
3647 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3648 return used;
3649
3650 if (issued & CEPH_CAP_FILE_LAZYIO) {
3651 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3652 used &= ~CEPH_CAP_FILE_CACHE;
3653 used |= CEPH_CAP_FILE_LAZYIO;
3654 }
3655 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3656 used &= ~CEPH_CAP_FILE_BUFFER;
3657 used |= CEPH_CAP_FILE_LAZYIO;
3658 }
3659 } else {
3660 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3661 used &= ~CEPH_CAP_FILE_CACHE;
3662 used |= CEPH_CAP_FILE_LAZYIO;
3663 }
3664 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3665 used &= ~CEPH_CAP_FILE_BUFFER;
3666 used |= CEPH_CAP_FILE_LAZYIO;
3667 }
3668 }
3669 return used;
3670 }
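//
// Illustrative walk-through of the remapping above: suppose used = Fc,
// and the MDS has revoked Fc but still issues Fl (issued contains Fl but
// not Fc; implemented contains both). The first branch rewrites used
// from Fc to Fl, so the cache usage is accounted against the still-issued
// LAZYIO cap and the pending Fc revocation is not treated as blocked by
// this inode.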
3671
3672 /**
3673 * check_caps
3674 *
3675 * Examine currently used and wanted versus held caps. Release, flush or ack
3676 * revoked caps to the MDS as appropriate.
3677 *
3678 * @param in the inode to check
3679 * @param flags flags to apply to cap check
3680 */
3681 void Client::check_caps(Inode *in, unsigned flags)
3682 {
3683 unsigned wanted = in->caps_wanted();
3684 unsigned used = get_caps_used(in);
3685 unsigned cap_used;
3686
3687 int implemented;
3688 int issued = in->caps_issued(&implemented);
3689 int revoking = implemented & ~issued;
3690
3691 int orig_used = used;
3692 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3693
3694 int retain = wanted | used | CEPH_CAP_PIN;
3695 if (!is_unmounting() && in->nlink > 0) {
3696 if (wanted) {
3697 retain |= CEPH_CAP_ANY;
3698 } else if (in->is_dir() &&
3699 (issued & CEPH_CAP_FILE_SHARED) &&
3700 (in->flags & I_COMPLETE)) {
3701 // we do this here because we don't want to drop to Fs (and then
3702 // drop the Fs if we do a create!) if that alone makes us send lookups
3703 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3704 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3705 retain |= wanted;
3706 } else {
3707 retain |= CEPH_CAP_ANY_SHARED;
3708 // keep RD only if we didn't have the file open RW,
3709 // because then the mds would revoke it anyway to
3710 // journal max_size=0.
3711 if (in->max_size == 0)
3712 retain |= CEPH_CAP_ANY_RD;
3713 }
3714 }
3715
3716 ldout(cct, 10) << __func__ << " on " << *in
3717 << " wanted " << ccap_string(wanted)
3718 << " used " << ccap_string(used)
3719 << " issued " << ccap_string(issued)
3720 << " revoking " << ccap_string(revoking)
3721 << " flags=" << flags
3722 << dendl;
3723
3724 if (in->snapid != CEPH_NOSNAP)
3725 return; // snap caps last forever, can't write
3726
3727 if (in->caps.empty())
3728 return; // guard if at end of func
3729
3730 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3731 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3732 if (_release(in))
3733 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
3734 }
3735
3736
3737 for (auto &p : in->caps) {
3738 mds_rank_t mds = p.first;
3739 Cap &cap = p.second;
3740
3741 MetaSession *session = &mds_sessions.at(mds);
3742
3743 cap_used = used;
3744 if (in->auth_cap && &cap != in->auth_cap)
3745 cap_used &= ~in->auth_cap->issued;
3746
3747 revoking = cap.implemented & ~cap.issued;
3748
3749 ldout(cct, 10) << " cap mds." << mds
3750 << " issued " << ccap_string(cap.issued)
3751 << " implemented " << ccap_string(cap.implemented)
3752 << " revoking " << ccap_string(revoking) << dendl;
3753
3754 if (in->wanted_max_size > in->max_size &&
3755 in->wanted_max_size > in->requested_max_size &&
3756 &cap == in->auth_cap)
3757 goto ack;
3758
3759 /* approaching file_max? */
3760 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3761 &cap == in->auth_cap &&
3762 is_max_size_approaching(in)) {
3763 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3764 << ", reported " << in->reported_size << dendl;
3765 goto ack;
3766 }
3767
3768 /* completed revocation? */
3769 if (revoking && (revoking & cap_used) == 0) {
3770 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
3771 goto ack;
3772 }
3773
3774 /* want more caps from mds? */
3775 if (wanted & ~(cap.wanted | cap.issued))
3776 goto ack;
3777
3778 if (!revoking && is_unmounting() && (cap_used == 0))
3779 goto ack;
3780
3781 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
3782 !in->dirty_caps) // and we have no dirty caps
3783 continue;
3784
3785 if (!(flags & CHECK_CAPS_NODELAY)) {
3786 ldout(cct, 10) << "delaying cap release" << dendl;
3787 cap_delay_requeue(in);
3788 continue;
3789 }
3790
3791 ack:
3792 if (&cap == in->auth_cap) {
3793 if (in->flags & I_KICK_FLUSH) {
3794 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3795 << " to mds." << mds << dendl;
3796 kick_flushing_caps(in, session);
3797 }
3798 if (!in->cap_snaps.empty() &&
3799 in->cap_snaps.rbegin()->second.flush_tid == 0)
3800 flush_snaps(in);
3801 }
3802
3803 int flushing;
3804 int msg_flags = 0;
3805 ceph_tid_t flush_tid;
3806 if (in->auth_cap == &cap && in->dirty_caps) {
3807 flushing = mark_caps_flushing(in, &flush_tid);
3808 if (flags & CHECK_CAPS_SYNCHRONOUS)
3809 msg_flags |= MClientCaps::FLAG_SYNC;
3810 } else {
3811 flushing = 0;
3812 flush_tid = 0;
3813 }
3814
3815 send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
3816 flushing, flush_tid);
3817 }
3818 }
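//
// Usage sketch: callers pass CHECK_CAPS_NODELAY to force an immediate
// decision; without it, caps that could be released are merely requeued
// via cap_delay_requeue() above. flush_caps_sync() further below also
// sets CHECK_CAPS_SYNCHRONOUS on the last inode it visits, so the final
// flush is sent with MClientCaps::FLAG_SYNC.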
3819
3820
3821 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3822 {
3823 int used = get_caps_used(in);
3824 int dirty = in->caps_dirty();
3825 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3826
3827 if (in->cap_snaps.size() &&
3828 in->cap_snaps.rbegin()->second.writing) {
3829 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
3830 return;
3831 } else if (in->caps_dirty() ||
3832 (used & CEPH_CAP_FILE_WR) ||
3833 (dirty & CEPH_CAP_ANY_WR)) {
3834 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3835 ceph_assert(capsnapem.second); /* element inserted */
3836 CapSnap &capsnap = capsnapem.first->second;
3837 capsnap.context = old_snapc;
3838 capsnap.issued = in->caps_issued();
3839 capsnap.dirty = in->caps_dirty();
3840
3841 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3842
3843 capsnap.uid = in->uid;
3844 capsnap.gid = in->gid;
3845 capsnap.mode = in->mode;
3846 capsnap.btime = in->btime;
3847 capsnap.xattrs = in->xattrs;
3848 capsnap.xattr_version = in->xattr_version;
3849 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3850 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3851
3852 if (used & CEPH_CAP_FILE_WR) {
3853 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
3854 capsnap.writing = 1;
3855 } else {
3856 finish_cap_snap(in, capsnap, used);
3857 }
3858 } else {
3859 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
3860 }
3861 }
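//
// CapSnap lifecycle sketch: queue_cap_snap() creates the CapSnap when a
// newer snap context supersedes dirty state. If Fw is in use, the snap
// stays in the "writing" state until the writer finishes, at which point
// finish_cap_snap() below freezes size/mtime/etc.; unless buffered data
// is still pending writeback (Fb), flush_snaps() then assigns a
// flush_tid and sends CEPH_CAP_OP_FLUSHSNAP, whose ack is processed by
// handle_cap_flushsnap_ack() further below.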
3862
3863 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3864 {
3865 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3866 capsnap.size = in->size;
3867 capsnap.mtime = in->mtime;
3868 capsnap.atime = in->atime;
3869 capsnap.ctime = in->ctime;
3870 capsnap.time_warp_seq = in->time_warp_seq;
3871 capsnap.change_attr = in->change_attr;
3872 capsnap.dirty |= in->caps_dirty();
3873
3874 /* Only reset it if it wasn't set before */
3875 if (capsnap.cap_dirtier_uid == -1) {
3876 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3877 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3878 }
3879
3880 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3881 capsnap.inline_data = in->inline_data;
3882 capsnap.inline_version = in->inline_version;
3883 }
3884
3885 if (used & CEPH_CAP_FILE_BUFFER) {
3886 capsnap.writing = 1;
3887 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
3888 << " WRBUFFER, delaying" << dendl;
3889 } else {
3890 capsnap.dirty_data = 0;
3891 flush_snaps(in);
3892 }
3893 }
3894
3895 void Client::send_flush_snap(Inode *in, MetaSession *session,
3896 snapid_t follows, CapSnap& capsnap)
3897 {
3898 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3899 in->ino, in->snaprealm->ino, 0,
3900 in->auth_cap->mseq, cap_epoch_barrier);
3901 m->caller_uid = capsnap.cap_dirtier_uid;
3902 m->caller_gid = capsnap.cap_dirtier_gid;
3903
3904 m->set_client_tid(capsnap.flush_tid);
3905 m->head.snap_follows = follows;
3906
3907 m->head.caps = capsnap.issued;
3908 m->head.dirty = capsnap.dirty;
3909
3910 m->head.uid = capsnap.uid;
3911 m->head.gid = capsnap.gid;
3912 m->head.mode = capsnap.mode;
3913 m->btime = capsnap.btime;
3914
3915 m->size = capsnap.size;
3916
3917 m->head.xattr_version = capsnap.xattr_version;
3918 encode(capsnap.xattrs, m->xattrbl);
3919
3920 m->ctime = capsnap.ctime;
3921 m->btime = capsnap.btime;
3922 m->mtime = capsnap.mtime;
3923 m->atime = capsnap.atime;
3924 m->time_warp_seq = capsnap.time_warp_seq;
3925 m->change_attr = capsnap.change_attr;
3926
3927 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3928 m->inline_version = in->inline_version;
3929 m->inline_data = in->inline_data;
3930 }
3931
3932 ceph_assert(!session->flushing_caps_tids.empty());
3933 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3934
3935 session->con->send_message2(std::move(m));
3936 }
3937
3938 void Client::flush_snaps(Inode *in)
3939 {
3940 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
3941 ceph_assert(in->cap_snaps.size());
3942
3943 // pick auth mds
3944 ceph_assert(in->auth_cap);
3945 MetaSession *session = in->auth_cap->session;
3946
3947 for (auto &p : in->cap_snaps) {
3948 CapSnap &capsnap = p.second;
3949 // only do new flush
3950 if (capsnap.flush_tid > 0)
3951 continue;
3952
3953 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3954 << " follows " << p.first
3955 << " size " << capsnap.size
3956 << " mtime " << capsnap.mtime
3957 << " dirty_data=" << capsnap.dirty_data
3958 << " writing=" << capsnap.writing
3959 << " on " << *in << dendl;
3960 if (capsnap.dirty_data || capsnap.writing)
3961 break;
3962
3963 capsnap.flush_tid = ++last_flush_tid;
3964 session->flushing_caps_tids.insert(capsnap.flush_tid);
3965 in->flushing_cap_tids[capsnap.flush_tid] = 0;
3966 if (!in->flushing_cap_item.is_on_list())
3967 session->flushing_caps.push_back(&in->flushing_cap_item);
3968
3969 send_flush_snap(in, session, p.first, capsnap);
3970 }
3971 }
3972
3973 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
3974 {
3975 ceph::condition_variable cond;
3976 ls.push_back(&cond);
3977 std::unique_lock l{client_lock, std::adopt_lock};
3978 cond.wait(l);
3979 l.release();
3980 ls.remove(&cond);
3981 }
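//
// The adopt_lock/release() pair above is the standard pattern for
// waiting on a condition variable with a mutex that is already held and
// must still be held on return; a minimal generic sketch:
//
//   std::unique_lock l{m, std::adopt_lock}; // take ownership, no lock()
//   cv.wait(l);                             // unlocks m while blocked
//   l.release();                            // drop ownership so the
//                                           // destructor does not unlock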
3982
3983 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
3984 {
3985 for (auto cond : ls) {
3986 cond->notify_all();
3987 }
3988 }
3989
3990 void Client::wait_on_context_list(list<Context*>& ls)
3991 {
3992 ceph::condition_variable cond;
3993 bool done = false;
3994 int r;
3995 ls.push_back(new C_Cond(cond, &done, &r));
3996 std::unique_lock l{client_lock, std::adopt_lock};
3997 cond.wait(l, [&done] { return done;});
3998 l.release();
3999 }
4000
4001 void Client::signal_context_list(list<Context*>& ls)
4002 {
4003 while (!ls.empty()) {
4004 ls.front()->complete(0);
4005 ls.pop_front();
4006 }
4007 }
4008
4009 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4010 {
4011 for (const auto &cap : s->caps) {
4012 auto &in = cap->inode;
4013 if (reconnect) {
4014 in.requested_max_size = 0;
4015 in.wanted_max_size = 0;
4016 } else {
4017 if (cap->gen < s->cap_gen) {
4018 // mds did not re-issue stale cap.
4019 cap->issued = cap->implemented = CEPH_CAP_PIN;
4020 // make sure mds knows what we want.
4021 if (in.caps_file_wanted() & ~cap->wanted)
4022 in.flags |= I_CAP_DROPPED;
4023 }
4024 }
4025 signal_cond_list(in.waitfor_caps);
4026 }
4027 }
4028
4029
4030 // flush dirty data (from objectcache)
4031
4032 class C_Client_CacheInvalidate : public Context {
4033 private:
4034 Client *client;
4035 vinodeno_t ino;
4036 int64_t offset, length;
4037 public:
4038 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4039 client(c), offset(off), length(len) {
4040 if (client->use_faked_inos())
4041 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4042 else
4043 ino = in->vino();
4044 }
4045 void finish(int r) override {
4046 // _async_invalidate takes the lock when it needs to; call this back from outside of the lock.
4047 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4048 client->_async_invalidate(ino, offset, length);
4049 }
4050 };
4051
4052 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4053 {
4054 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4055 if (!mref_reader.is_state_satisfied())
4056 return;
4057
4058 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
4059 ino_invalidate_cb(callback_handle, ino, off, len);
4060 }
4061
4062 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4063
4064 if (ino_invalidate_cb)
4065 // we queue the invalidate, which calls the callback and decrements the ref
4066 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4067 }
4068
4069 void Client::_invalidate_inode_cache(Inode *in)
4070 {
4071 ldout(cct, 10) << __func__ << " " << *in << dendl;
4072
4073 // invalidate our userspace inode cache
4074 if (cct->_conf->client_oc) {
4075 objectcacher->release_set(&in->oset);
4076 if (!objectcacher->set_is_empty(&in->oset))
4077 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4078 }
4079
4080 _schedule_invalidate_callback(in, 0, 0);
4081 }
4082
4083 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4084 {
4085 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
4086
4087 // invalidate our userspace inode cache
4088 if (cct->_conf->client_oc) {
4089 vector<ObjectExtent> ls;
4090 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
4091 objectcacher->discard_writeback(&in->oset, ls, nullptr);
4092 }
4093
4094 _schedule_invalidate_callback(in, off, len);
4095 }
4096
4097 bool Client::_release(Inode *in)
4098 {
4099 ldout(cct, 20) << "_release " << *in << dendl;
4100 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4101 _invalidate_inode_cache(in);
4102 return true;
4103 }
4104 return false;
4105 }
4106
4107 bool Client::_flush(Inode *in, Context *onfinish)
4108 {
4109 ldout(cct, 10) << "_flush " << *in << dendl;
4110
4111 if (!in->oset.dirty_or_tx) {
4112 ldout(cct, 10) << " nothing to flush" << dendl;
4113 onfinish->complete(0);
4114 return true;
4115 }
4116
4117 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
4118 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
4119 objectcacher->purge_set(&in->oset);
4120 if (onfinish) {
4121 onfinish->complete(-CEPHFS_ENOSPC);
4122 }
4123 return true;
4124 }
4125
4126 return objectcacher->flush_set(&in->oset, onfinish);
4127 }
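//
// Caller sketch (mirrors _flush_range() below): onfinish may fire
// immediately, so waiters must tolerate an already-completed context.
//
//   C_SaferCond flushed("example");   // hypothetical caller
//   _flush(in, &flushed);
//   client_lock.unlock();
//   int r = flushed.wait();           // 0, or -CEPHFS_ENOSPC if the
//   client_lock.lock();               // pool was full and we purged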
4128
4129 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4130 {
4131 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4132 if (!in->oset.dirty_or_tx) {
4133 ldout(cct, 10) << " nothing to flush" << dendl;
4134 return;
4135 }
4136
4137 C_SaferCond onflush("Client::_flush_range flock");
4138 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4139 offset, size, &onflush);
4140 if (!ret) {
4141 // wait for flush
4142 client_lock.unlock();
4143 onflush.wait();
4144 client_lock.lock();
4145 }
4146 }
4147
4148 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4149 {
4150 // std::scoped_lock l(client_lock);
4151 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
4152 Inode *in = static_cast<Inode *>(oset->parent);
4153 ceph_assert(in);
4154 _flushed(in);
4155 }
4156
4157 void Client::_flushed(Inode *in)
4158 {
4159 ldout(cct, 10) << "_flushed " << *in << dendl;
4160
4161 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4162 }
4163
4164
4165
4166 // checks common to add_update_cap, handle_cap_grant
4167 void Client::check_cap_issue(Inode *in, unsigned issued)
4168 {
4169 unsigned had = in->caps_issued();
4170
4171 if ((issued & CEPH_CAP_FILE_CACHE) &&
4172 !(had & CEPH_CAP_FILE_CACHE))
4173 in->cache_gen++;
4174
4175 if ((issued & CEPH_CAP_FILE_SHARED) !=
4176 (had & CEPH_CAP_FILE_SHARED)) {
4177 if (issued & CEPH_CAP_FILE_SHARED)
4178 in->shared_gen++;
4179 if (in->is_dir())
4180 clear_dir_complete_and_ordered(in, true);
4181 }
4182 }
4183
4184 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4185 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4186 inodeno_t realm, int flags, const UserPerm& cap_perms)
4187 {
4188 if (!in->is_any_caps()) {
4189 ceph_assert(in->snaprealm == 0);
4190 in->snaprealm = get_snap_realm(realm);
4191 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4192 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4193 } else {
4194 ceph_assert(in->snaprealm);
4195 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4196 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4197 in->snaprealm_item.remove_myself();
4198 auto oldrealm = in->snaprealm;
4199 in->snaprealm = get_snap_realm(realm);
4200 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4201 put_snap_realm(oldrealm);
4202 }
4203 }
4204
4205 mds_rank_t mds = mds_session->mds_num;
4206 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4207 Cap &cap = capem.first->second;
4208 if (!capem.second) {
4209 if (cap.gen < mds_session->cap_gen)
4210 cap.issued = cap.implemented = CEPH_CAP_PIN;
4211
4212 /*
4213 * auth mds of the inode changed. we received the cap export
4214 * message, but still haven't received the cap import message.
4215 * handle_cap_export() updated the new auth MDS' cap.
4216 *
4217 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4218 * a message that was sent before the cap import message. So
4219 * don't remove caps.
4220 */
4221 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4222 if (&cap != in->auth_cap)
4223 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4224
4225 ceph_assert(cap.cap_id == cap_id);
4226 seq = cap.seq;
4227 mseq = cap.mseq;
4228 issued |= cap.issued;
4229 flags |= CEPH_CAP_FLAG_AUTH;
4230 }
4231 } else {
4232 inc_pinned_icaps();
4233 }
4234
4235 check_cap_issue(in, issued);
4236
4237 if (flags & CEPH_CAP_FLAG_AUTH) {
4238 if (in->auth_cap != &cap &&
4239 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4240 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
4241 ldout(cct, 10) << __func__ << " changing auth cap: "
4242 << "add myself to new auth MDS' flushing caps list" << dendl;
4243 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4244 }
4245 in->auth_cap = &cap;
4246 }
4247 }
4248
4249 unsigned old_caps = cap.issued;
4250 cap.cap_id = cap_id;
4251 cap.issued = issued;
4252 cap.implemented |= issued;
4253 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4254 cap.wanted = wanted;
4255 else
4256 cap.wanted |= wanted;
4257 cap.seq = seq;
4258 cap.issue_seq = seq;
4259 cap.mseq = mseq;
4260 cap.gen = mds_session->cap_gen;
4261 cap.latest_perms = cap_perms;
4262 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4263 << " from mds." << mds
4264 << " on " << *in
4265 << dendl;
4266
4267 if ((issued & ~old_caps) && in->auth_cap == &cap) {
4268 // non-auth MDS is revoking the newly grant caps ?
4269 for (auto &p : in->caps) {
4270 if (&p.second == &cap)
4271 continue;
4272 if (p.second.implemented & ~p.second.issued & issued) {
4273 check_caps(in, CHECK_CAPS_NODELAY);
4274 break;
4275 }
4276 }
4277 }
4278
4279 if (issued & ~old_caps)
4280 signal_cond_list(in->waitfor_caps);
4281 }
4282
4283 void Client::remove_cap(Cap *cap, bool queue_release)
4284 {
4285 auto &in = cap->inode;
4286 MetaSession *session = cap->session;
4287 mds_rank_t mds = cap->session->mds_num;
4288
4289 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4290
4291 if (queue_release) {
4292 session->enqueue_cap_release(
4293 in.ino,
4294 cap->cap_id,
4295 cap->issue_seq,
4296 cap->mseq,
4297 cap_epoch_barrier);
4298 } else {
4299 dec_pinned_icaps();
4300 }
4301
4302
4303 if (in.auth_cap == cap) {
4304 if (in.flushing_cap_item.is_on_list()) {
4305 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4306 in.flushing_cap_item.remove_myself();
4307 }
4308 in.auth_cap = NULL;
4309 }
4310 size_t n = in.caps.erase(mds);
4311 ceph_assert(n == 1);
4312 cap = nullptr;
4313
4314 if (!in.is_any_caps()) {
4315 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4316 in.snaprealm_item.remove_myself();
4317 put_snap_realm(in.snaprealm);
4318 in.snaprealm = 0;
4319 }
4320 }
4321
4322 void Client::remove_all_caps(Inode *in)
4323 {
4324 while (!in->caps.empty())
4325 remove_cap(&in->caps.begin()->second, true);
4326 }
4327
4328 void Client::remove_session_caps(MetaSession *s, int err)
4329 {
4330 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4331
4332 while (s->caps.size()) {
4333 Cap *cap = *s->caps.begin();
4334 InodeRef in(&cap->inode);
4335 bool dirty_caps = false;
4336 if (in->auth_cap == cap) {
4337 dirty_caps = in->dirty_caps | in->flushing_caps;
4338 in->wanted_max_size = 0;
4339 in->requested_max_size = 0;
4340 if (in->has_any_filelocks())
4341 in->flags |= I_ERROR_FILELOCK;
4342 }
4343 auto caps = cap->implemented;
4344 if (cap->wanted | cap->issued)
4345 in->flags |= I_CAP_DROPPED;
4346 remove_cap(cap, false);
4347 in->cap_snaps.clear();
4348 if (dirty_caps) {
4349 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4350 if (in->flushing_caps) {
4351 num_flushing_caps--;
4352 in->flushing_cap_tids.clear();
4353 }
4354 in->flushing_caps = 0;
4355 in->mark_caps_clean();
4356 put_inode(in.get());
4357 }
4358 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4359 if (caps && !in->caps_issued_mask(caps, true)) {
4360 if (err == -CEPHFS_EBLOCKLISTED) {
4361 if (in->oset.dirty_or_tx) {
4362 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4363 in->set_async_err(err);
4364 }
4365 objectcacher->purge_set(&in->oset);
4366 } else {
4367 objectcacher->release_set(&in->oset);
4368 }
4369 _schedule_invalidate_callback(in.get(), 0, 0);
4370 }
4371
4372 signal_cond_list(in->waitfor_caps);
4373 }
4374 s->flushing_caps_tids.clear();
4375 sync_cond.notify_all();
4376 }
4377
4378 int Client::_do_remount(bool retry_on_error)
4379 {
4380 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
4381
4382 errno = 0;
4383 int r = remount_cb(callback_handle);
4384 if (r == 0) {
4385 retries_on_invalidate = 0;
4386 } else {
4387 int e = errno;
4388 client_t whoami = get_nodeid();
4389 if (r == -1) {
4390 lderr(cct) <<
4391 "failed to remount (to trim kernel dentries): "
4392 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4393 } else {
4394 lderr(cct) <<
4395 "failed to remount (to trim kernel dentries): "
4396 "return code = " << r << dendl;
4397 }
4398 bool should_abort =
4399 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4400 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4401 !(retry_on_error && (++retries_on_invalidate < max_retries));
4402 if (should_abort && !is_unmounting()) {
4403 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4404 ceph_abort();
4405 }
4406 }
4407 return r;
4408 }
4409
4410 class C_Client_Remount : public Context {
4411 private:
4412 Client *client;
4413 public:
4414 explicit C_Client_Remount(Client *c) : client(c) {}
4415 void finish(int r) override {
4416 ceph_assert(r == 0);
4417 client->_do_remount(true);
4418 }
4419 };
4420
4421 void Client::_invalidate_kernel_dcache()
4422 {
4423 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4424 if (!mref_reader.is_state_satisfied())
4425 return;
4426
4427 if (can_invalidate_dentries) {
4428 if (dentry_invalidate_cb && root->dir) {
4429 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4430 p != root->dir->dentries.end();
4431 ++p) {
4432 if (p->second->inode)
4433 _schedule_invalidate_dentry_callback(p->second, false);
4434 }
4435 }
4436 } else if (remount_cb) {
4437 // Hacky:
4438 // when remounting a file system, the Linux kernel trims all unused dentries in the fs
4439 remount_finisher.queue(new C_Client_Remount(this));
4440 }
4441 }
4442
4443 void Client::_trim_negative_child_dentries(InodeRef& in)
4444 {
4445 if (!in->is_dir())
4446 return;
4447
4448 Dir* dir = in->dir;
4449 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4450 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4451 Dentry *dn = p->second;
4452 ++p;
4453 ceph_assert(!dn->inode);
4454 if (dn->lru_is_expireable())
4455 unlink(dn, true, false); // keep dir, drop dentry
4456 }
4457 if (dir->dentries.empty()) {
4458 close_dir(dir);
4459 }
4460 }
4461
4462 if (in->flags & I_SNAPDIR_OPEN) {
4463 InodeRef snapdir = open_snapdir(in.get());
4464 _trim_negative_child_dentries(snapdir);
4465 }
4466 }
4467
4468 class C_Client_CacheRelease : public Context {
4469 private:
4470 Client *client;
4471 vinodeno_t ino;
4472 public:
4473 C_Client_CacheRelease(Client *c, Inode *in) :
4474 client(c) {
4475 if (client->use_faked_inos())
4476 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4477 else
4478 ino = in->vino();
4479 }
4480 void finish(int r) override {
4481 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4482 client->_async_inode_release(ino);
4483 }
4484 };
4485
4486 void Client::_async_inode_release(vinodeno_t ino)
4487 {
4488 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4489 if (!mref_reader.is_state_satisfied())
4490 return;
4491
4492 ldout(cct, 10) << __func__ << " " << ino << dendl;
4493 ino_release_cb(callback_handle, ino);
4494 }
4495
4496 void Client::_schedule_ino_release_callback(Inode *in) {
4497
4498 if (ino_release_cb)
4499 // we queue the release, which calls the callback and decrements the ref
4500 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4501 }
4502
4503 void Client::trim_caps(MetaSession *s, uint64_t max)
4504 {
4505 mds_rank_t mds = s->mds_num;
4506 size_t caps_size = s->caps.size();
4507 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
4508 << " caps " << caps_size << dendl;
4509
4510 uint64_t trimmed = 0;
4511 auto p = s->caps.begin();
4512 std::set<Dentry *> to_trim; /* defer dentry trimming so that caps other
4513 * than the one we're looking at don't get deleted during traversal. */
4514 while ((caps_size - trimmed) > max && !p.end()) {
4515 Cap *cap = *p;
4516 InodeRef in(&cap->inode);
4517
4518 // Increment p early because it will be invalidated if cap
4519 // is deleted inside remove_cap
4520 ++p;
4521
4522 if (in->caps.size() > 1 && cap != in->auth_cap) {
4523 int mine = cap->issued | cap->implemented;
4524 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4525 // disposable non-auth cap
4526 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
4527 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4528 cap = (remove_cap(cap, true), nullptr);
4529 trimmed++;
4530 }
4531 } else {
4532 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4533 _trim_negative_child_dentries(in);
4534 bool all = true;
4535 auto q = in->dentries.begin();
4536 while (q != in->dentries.end()) {
4537 Dentry *dn = *q;
4538 ++q;
4539 if (dn->lru_is_expireable()) {
4540 if (can_invalidate_dentries &&
4541 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4542 // Only issue one of these per DN for inodes in root: handle
4543 // others more efficiently by calling for root-child DNs at
4544 // the end of this function.
4545 _schedule_invalidate_dentry_callback(dn, true);
4546 }
4547 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4548 to_trim.insert(dn);
4549 } else {
4550 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4551 all = false;
4552 }
4553 }
4554 if (in->ll_ref == 1 && in->ino != MDS_INO_ROOT) {
4555 _schedule_ino_release_callback(in.get());
4556 }
4557 if (all && in->ino != MDS_INO_ROOT) {
4558 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4559 trimmed++;
4560 }
4561 }
4562 }
4563 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4564 for (const auto &dn : to_trim) {
4565 trim_dentry(dn);
4566 }
4567 to_trim.clear();
4568
4569 caps_size = s->caps.size();
4570 if (caps_size > (size_t)max)
4571 _invalidate_kernel_dcache();
4572 }
4573
4574 void Client::force_session_readonly(MetaSession *s)
4575 {
4576 s->readonly = true;
4577 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4578 auto &in = (*p)->inode;
4579 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4580 signal_cond_list(in.waitfor_caps);
4581 }
4582 }
4583
4584 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4585 {
4586 MetaSession *session = in->auth_cap->session;
4587
4588 int flushing = in->dirty_caps;
4589 ceph_assert(flushing);
4590
4591 ceph_tid_t flush_tid = ++last_flush_tid;
4592 in->flushing_cap_tids[flush_tid] = flushing;
4593
4594 if (!in->flushing_caps) {
4595 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
4596 num_flushing_caps++;
4597 } else {
4598 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
4599 }
4600
4601 in->flushing_caps |= flushing;
4602 in->mark_caps_clean();
4603
4604 if (!in->flushing_cap_item.is_on_list())
4605 session->flushing_caps.push_back(&in->flushing_cap_item);
4606 session->flushing_caps_tids.insert(flush_tid);
4607
4608 *ptid = flush_tid;
4609 return flushing;
4610 }
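//
// Bookkeeping sketch: if Fw|Fx are dirty, one call moves both bits from
// dirty_caps into flushing_caps under a fresh tid, e.g.
// flushing_cap_tids[42] = Fw|Fx; tid 42 also joins the session-wide
// flushing_caps_tids set, whose minimum is advertised to the MDS as the
// oldest flush tid in every cap message (see send_cap() above).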
4611
4612 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4613 {
4614 for (auto &p : in->cap_snaps) {
4615 CapSnap &capsnap = p.second;
4616 if (capsnap.flush_tid > 0) {
4617 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4618 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4619 }
4620 }
4621 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4622 it != in->flushing_cap_tids.end();
4623 ++it) {
4624 old_s->flushing_caps_tids.erase(it->first);
4625 new_s->flushing_caps_tids.insert(it->first);
4626 }
4627 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4628 }
4629
4630 /*
4631 * Flush all caps back to the MDS. Because the callers generally wait on the
4632 * result of this function (syncfs and umount cases), we set
4633 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4634 */
4635 void Client::flush_caps_sync()
4636 {
4637 ldout(cct, 10) << __func__ << dendl;
4638 xlist<Inode*>::iterator p = delayed_list.begin();
4639 while (!p.end()) {
4640 unsigned flags = CHECK_CAPS_NODELAY;
4641 Inode *in = *p;
4642
4643 ++p;
4644 delayed_list.pop_front();
4645 if (p.end() && dirty_list.empty())
4646 flags |= CHECK_CAPS_SYNCHRONOUS;
4647 check_caps(in, flags);
4648 }
4649
4650 // other caps, too
4651 p = dirty_list.begin();
4652 while (!p.end()) {
4653 unsigned flags = CHECK_CAPS_NODELAY;
4654 Inode *in = *p;
4655
4656 ++p;
4657 if (p.end())
4658 flags |= CHECK_CAPS_SYNCHRONOUS;
4659 check_caps(in, flags);
4660 }
4661 }
4662
4663 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4664 {
4665 while (in->flushing_caps) {
4666 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4667 ceph_assert(it != in->flushing_cap_tids.end());
4668 if (it->first > want)
4669 break;
4670 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
4671 << ccap_string(it->second) << " want " << want
4672 << " last " << it->first << dendl;
4673 wait_on_list(in->waitfor_caps);
4674 }
4675 }
4676
4677 void Client::wait_sync_caps(ceph_tid_t want)
4678 {
4679 retry:
4680 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
4681 << num_flushing_caps << " total flushing)" << dendl;
4682 for (auto &p : mds_sessions) {
4683 MetaSession *s = &p.second;
4684 if (s->flushing_caps_tids.empty())
4685 continue;
4686 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4687 if (oldest_tid <= want) {
4688 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
4689 << " (want " << want << ")" << dendl;
4690 std::unique_lock l{client_lock, std::adopt_lock};
4691 sync_cond.wait(l);
4692 l.release();
4693 goto retry;
4694 }
4695 }
4696 }
4697
4698 void Client::kick_flushing_caps(Inode *in, MetaSession *session)
4699 {
4700 in->flags &= ~I_KICK_FLUSH;
4701
4702 Cap *cap = in->auth_cap;
4703 ceph_assert(cap->session == session);
4704
4705 ceph_tid_t last_snap_flush = 0;
4706 for (auto p = in->flushing_cap_tids.rbegin();
4707 p != in->flushing_cap_tids.rend();
4708 ++p) {
4709 if (!p->second) {
4710 last_snap_flush = p->first;
4711 break;
4712 }
4713 }
4714
4715 int wanted = in->caps_wanted();
4716 int used = get_caps_used(in) | in->caps_dirty();
4717 auto it = in->cap_snaps.begin();
4718 for (auto& p : in->flushing_cap_tids) {
4719 if (p.second) {
4720 int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
4721 send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
4722 p.second, p.first);
4723 } else {
4724 ceph_assert(it != in->cap_snaps.end());
4725 ceph_assert(it->second.flush_tid == p.first);
4726 send_flush_snap(in, session, it->first, it->second);
4727 ++it;
4728 }
4729 }
4730 }
4731
4732 void Client::kick_flushing_caps(MetaSession *session)
4733 {
4734 mds_rank_t mds = session->mds_num;
4735 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
4736
4737 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4738 Inode *in = *p;
4739 if (in->flags & I_KICK_FLUSH) {
4740 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4741 kick_flushing_caps(in, session);
4742 }
4743 }
4744 }
4745
4746 void Client::early_kick_flushing_caps(MetaSession *session)
4747 {
4748 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4749 Inode *in = *p;
4750 Cap *cap = in->auth_cap;
4751 ceph_assert(cap);
4752
4753 // if flushing caps were revoked, we re-send the cap flush in the client
4754 // reconnect stage. This guarantees that the MDS processes the cap flush
4755 // message before issuing the flushing caps to other clients.
4756 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
4757 in->flags |= I_KICK_FLUSH;
4758 continue;
4759 }
4760
4761 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4762 << " to mds." << session->mds_num << dendl;
4763 // send_reconnect() also will reset these sequence numbers. make sure
4764 // sequence numbers in cap flush message match later reconnect message.
4765 cap->seq = 0;
4766 cap->issue_seq = 0;
4767 cap->mseq = 0;
4768 cap->issued = cap->implemented;
4769
4770 kick_flushing_caps(in, session);
4771 }
4772 }
4773
4774 void SnapRealm::build_snap_context()
4775 {
4776 set<snapid_t> snaps;
4777 snapid_t max_seq = seq;
4778
4779 // start with prior_parents?
4780 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4781 snaps.insert(prior_parent_snaps[i]);
4782
4783 // current parent's snaps
4784 if (pparent) {
4785 const SnapContext& psnapc = pparent->get_snap_context();
4786 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4787 if (psnapc.snaps[i] >= parent_since)
4788 snaps.insert(psnapc.snaps[i]);
4789 if (psnapc.seq > max_seq)
4790 max_seq = psnapc.seq;
4791 }
4792
4793 // my snaps
4794 for (unsigned i=0; i<my_snaps.size(); i++)
4795 snaps.insert(my_snaps[i]);
4796
4797 // ok!
4798 cached_snap_context.seq = max_seq;
4799 cached_snap_context.snaps.resize(0);
4800 cached_snap_context.snaps.reserve(snaps.size());
4801 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4802 cached_snap_context.snaps.push_back(*p);
4803 }
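//
// Note: the reverse iteration above leaves cached_snap_context.snaps in
// descending order (newest snapshot first), which is the ordering that
// SnapContext consumers such as has_new_snaps() below depend on.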
4804
4805 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4806 {
4807 list<SnapRealm*> q;
4808 q.push_back(realm);
4809
4810 while (!q.empty()) {
4811 realm = q.front();
4812 q.pop_front();
4813
4814 ldout(cct, 10) << __func__ << " " << *realm << dendl;
4815 realm->invalidate_cache();
4816
4817 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4818 p != realm->pchildren.end();
4819 ++p)
4820 q.push_back(*p);
4821 }
4822 }
4823
4824 SnapRealm *Client::get_snap_realm(inodeno_t r)
4825 {
4826 SnapRealm *realm = snap_realms[r];
4827 if (!realm)
4828 snap_realms[r] = realm = new SnapRealm(r);
4829 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4830 realm->nref++;
4831 return realm;
4832 }
4833
4834 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4835 {
4836 if (snap_realms.count(r) == 0) {
4837 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
4838 return NULL;
4839 }
4840 SnapRealm *realm = snap_realms[r];
4841 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4842 realm->nref++;
4843 return realm;
4844 }
4845
4846 void Client::put_snap_realm(SnapRealm *realm)
4847 {
4848 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
4849 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4850 if (--realm->nref == 0) {
4851 snap_realms.erase(realm->ino);
4852 if (realm->pparent) {
4853 realm->pparent->pchildren.erase(realm);
4854 put_snap_realm(realm->pparent);
4855 }
4856 delete realm;
4857 }
4858 }
4859
4860 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4861 {
4862 if (realm->parent != parent) {
4863 ldout(cct, 10) << __func__ << " " << *realm
4864 << " " << realm->parent << " -> " << parent << dendl;
4865 realm->parent = parent;
4866 if (realm->pparent) {
4867 realm->pparent->pchildren.erase(realm);
4868 put_snap_realm(realm->pparent);
4869 }
4870 realm->pparent = get_snap_realm(parent);
4871 realm->pparent->pchildren.insert(realm);
4872 return true;
4873 }
4874 return false;
4875 }
4876
4877 static bool has_new_snaps(const SnapContext& old_snapc,
4878 const SnapContext& new_snapc)
4879 {
4880 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4881 }
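//
// Since snapc.snaps is newest-first (see build_snap_context above),
// snaps[0] is the most recent snapshot; the new context contains a snap
// the old one never saw exactly when snaps[0] > old_snapc.seq.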
4882
4883
4884 void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
4885 {
4886 SnapRealm *first_realm = NULL;
4887 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
4888
4889 map<SnapRealm*, SnapContext> dirty_realms;
4890
4891 auto p = bl.cbegin();
4892 while (!p.end()) {
4893 SnapRealmInfo info;
4894 decode(info, p);
4895 SnapRealm *realm = get_snap_realm(info.ino());
4896
4897 bool invalidate = false;
4898
4899 if (info.seq() > realm->seq) {
4900 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
4901 << dendl;
4902
4903 if (flush) {
4904 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4905 // flush me + children
4906 list<SnapRealm*> q;
4907 q.push_back(realm);
4908 while (!q.empty()) {
4909 SnapRealm *realm = q.front();
4910 q.pop_front();
4911
4912 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4913 p != realm->pchildren.end();
4914 ++p)
4915 q.push_back(*p);
4916
4917 if (dirty_realms.count(realm) == 0) {
4918 realm->nref++;
4919 dirty_realms[realm] = realm->get_snap_context();
4920 }
4921 }
4922 }
4923
4924 // update
4925 realm->seq = info.seq();
4926 realm->created = info.created();
4927 realm->parent_since = info.parent_since();
4928 realm->prior_parent_snaps = info.prior_parent_snaps;
4929 realm->my_snaps = info.my_snaps;
4930 invalidate = true;
4931 }
4932
4933 // _always_ verify parent
4934 if (adjust_realm_parent(realm, info.parent()))
4935 invalidate = true;
4936
4937 if (invalidate) {
4938 invalidate_snaprealm_and_children(realm);
4939 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
4940 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4941 } else {
4942 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
4943 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4944 }
4945
4946 if (!first_realm)
4947 first_realm = realm;
4948 else
4949 put_snap_realm(realm);
4950 }
4951
4952 for (auto &[realm, snapc] : dirty_realms) {
4953 // if there are new snaps ?
4954 if (has_new_snaps(snapc, realm->get_snap_context())) {
4955 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4956 for (auto&& in : realm->inodes_with_caps) {
4957 queue_cap_snap(in, snapc);
4958 }
4959 } else {
4960 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4961 }
4962 put_snap_realm(realm);
4963 }
4964
4965 if (realm_ret)
4966 *realm_ret = first_realm;
4967 else
4968 put_snap_realm(first_realm);
4969 }
4970
4971 void Client::handle_snap(const MConstRef<MClientSnap>& m)
4972 {
4973 ldout(cct, 10) << __func__ << " " << *m << dendl;
4974 mds_rank_t mds = mds_rank_t(m->get_source().num());
4975
4976 std::scoped_lock cl(client_lock);
4977 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4978 if (!session) {
4979 return;
4980 }
4981
4982 got_mds_push(session);
4983
4984 map<Inode*, SnapContext> to_move;
4985 SnapRealm *realm = 0;
4986
4987 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4988 ceph_assert(m->head.split);
4989 SnapRealmInfo info;
4990 auto p = m->bl.cbegin();
4991 decode(info, p);
4992 ceph_assert(info.ino() == m->head.split);
4993
4994 // flush, then move, inos.
4995 realm = get_snap_realm(info.ino());
4996 ldout(cct, 10) << " splitting off " << *realm << dendl;
4997 for (auto& ino : m->split_inos) {
4998 vinodeno_t vino(ino, CEPH_NOSNAP);
4999 if (inode_map.count(vino)) {
5000 Inode *in = inode_map[vino];
5001 if (!in->snaprealm || in->snaprealm == realm)
5002 continue;
5003 if (in->snaprealm->created > info.created()) {
5004 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5005 << *in->snaprealm << dendl;
5006 continue;
5007 }
5008 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5009
5010
5011 in->snaprealm_item.remove_myself();
5012 to_move[in] = in->snaprealm->get_snap_context();
5013 put_snap_realm(in->snaprealm);
5014 }
5015 }
5016
5017 // move child snaprealms, too
5018 for (auto& child_realm : m->split_realms) {
5019 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5020 SnapRealm *child = get_snap_realm_maybe(child_realm);
5021 if (!child)
5022 continue;
5023 adjust_realm_parent(child, realm->ino);
5024 put_snap_realm(child);
5025 }
5026 }
5027
5028 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
5029
5030 if (realm) {
5031 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5032 Inode *in = p->first;
5033 in->snaprealm = realm;
5034 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5035 realm->nref++;
5036 // queue for snap writeback
5037 if (has_new_snaps(p->second, realm->get_snap_context()))
5038 queue_cap_snap(in, p->second);
5039 }
5040 put_snap_realm(realm);
5041 }
5042 }
5043
5044 void Client::handle_quota(const MConstRef<MClientQuota>& m)
5045 {
5046 mds_rank_t mds = mds_rank_t(m->get_source().num());
5047
5048 std::scoped_lock cl(client_lock);
5049 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5050 if (!session) {
5051 return;
5052 }
5053
5054 got_mds_push(session);
5055
5056 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
5057
5058 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5059 if (inode_map.count(vino)) {
5060 Inode *in = NULL;
5061 in = inode_map[vino];
5062
5063 if (in) {
5064 in->quota = m->quota;
5065 in->rstat = m->rstat;
5066 }
5067 }
5068 }
5069
5070 void Client::handle_caps(const MConstRef<MClientCaps>& m)
5071 {
5072 mds_rank_t mds = mds_rank_t(m->get_source().num());
5073
5074 std::scoped_lock cl(client_lock);
5075 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
5076 if (!session) {
5077 return;
5078 }
5079
5080 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5081 // Pause RADOS operations until we see the required epoch
5082 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5083 }
5084
5085 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5086 // Record the barrier so that we will transmit it to MDS when releasing
5087 set_cap_epoch_barrier(m->osd_epoch_barrier);
5088 }
5089
5090 got_mds_push(session);
5091
5092 Inode *in;
5093 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
5094 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5095 in = it->second;
5096 } else {
5097 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
5098 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
5099 session->enqueue_cap_release(
5100 m->get_ino(),
5101 m->get_cap_id(),
5102 m->get_seq(),
5103 m->get_mseq(),
5104 cap_epoch_barrier);
5105 } else {
5106 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
5107 }
5108
5109 // in case the mds is waiting on e.g. a revocation
5110 flush_cap_releases();
5111 return;
5112 }
5113
5114 switch (m->get_op()) {
5115 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
5116 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
5117 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
5118 }
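// Note: IMPORT deliberately falls through: handle_cap_import() above
// installs or updates the cap, then the switch below applies the newly
// granted caps via handle_cap_grant().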
5119
5120 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5121 Cap &cap = in->caps.at(mds);
5122
5123 switch (m->get_op()) {
5124 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
5125 case CEPH_CAP_OP_IMPORT:
5126 case CEPH_CAP_OP_REVOKE:
5127 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
5128 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
5129 }
5130 } else {
5131 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5132 return;
5133 }
5134 }
5135
5136 void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5137 {
5138 mds_rank_t mds = session->mds_num;
5139
5140 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5141 << " IMPORT from mds." << mds << dendl;
5142
5143 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5144 Cap *cap = NULL;
5145 UserPerm cap_perms;
5146 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5147 cap = &it->second;
5148 cap_perms = cap->latest_perms;
5149 }
5150
5151 // add/update it
5152 SnapRealm *realm = NULL;
5153 update_snap_trace(m->snapbl, &realm);
5154
5155 int issued = m->get_caps();
5156 int wanted = m->get_wanted();
5157 add_update_cap(in, session, m->get_cap_id(),
5158 issued, wanted, m->get_seq(), m->get_mseq(),
5159 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
5160
5161 if (cap && cap->cap_id == m->peer.cap_id) {
5162 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5163 }
5164
5165 if (realm)
5166 put_snap_realm(realm);
5167
5168 if (in->auth_cap && in->auth_cap->session == session) {
5169 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5170 in->requested_max_size > m->get_max_size()) {
5171 in->requested_max_size = 0;
5172 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5173 }
5174 // reflush any/all caps (if we are now the auth_cap)
5175 kick_flushing_caps(in, session);
5176 }
5177 }
5178
5179 void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5180 {
5181 mds_rank_t mds = session->mds_num;
5182
5183 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
5184 << " EXPORT from mds." << mds << dendl;
5185
5186 auto it = in->caps.find(mds);
5187 if (it != in->caps.end()) {
5188 Cap &cap = it->second;
5189 if (cap.cap_id == m->get_cap_id()) {
5190 if (m->peer.cap_id) {
5191 const auto peer_mds = mds_rank_t(m->peer.mds);
5192 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
5193 auto it = in->caps.find(peer_mds);
5194 if (it != in->caps.end()) {
5195 Cap &tcap = it->second;
5196 if (tcap.cap_id == m->peer.cap_id &&
5197 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5198 tcap.cap_id = m->peer.cap_id;
5199 tcap.seq = m->peer.seq - 1;
5200 tcap.issue_seq = tcap.seq;
5201 tcap.issued |= cap.issued;
5202 tcap.implemented |= cap.issued;
5203 if (&cap == in->auth_cap)
5204 in->auth_cap = &tcap;
5205 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
5206 adjust_session_flushing_caps(in, session, tsession);
5207 }
5208 } else {
5209 add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
5210 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5211 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5212 cap.latest_perms);
5213 }
5214 } else {
5215 if (cap.wanted | cap.issued)
5216 in->flags |= I_CAP_DROPPED;
5217 }
5218
5219 remove_cap(&cap, false);
5220 }
5221 }
5222 }
5223
5224 void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5225 {
5226 mds_rank_t mds = session->mds_num;
5227 ceph_assert(in->caps.count(mds));
5228
5229 ldout(cct, 10) << __func__ << " on ino " << *in
5230 << " size " << in->size << " -> " << m->get_size()
5231 << dendl;
5232
5233 int issued;
5234 in->caps_issued(&issued);
5235 issued |= in->caps_dirty();
5236 update_inode_file_size(in, issued, m->get_size(),
5237 m->get_truncate_seq(), m->get_truncate_size());
5238 }
5239
5240 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5241 {
5242 ceph_tid_t flush_ack_tid = m->get_client_tid();
5243 int dirty = m->get_dirty();
5244 int cleaned = 0;
5245 int flushed = 0;
5246
5247 auto it = in->flushing_cap_tids.begin();
5248 if (it->first < flush_ack_tid) {
5249 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5250 << " got unexpected flush ack tid " << flush_ack_tid
5251 << " expected is " << it->first << dendl;
5252 }
5253 for (; it != in->flushing_cap_tids.end(); ) {
5254 if (!it->second) {
5255 // cap snap
5256 ++it;
5257 continue;
5258 }
5259 if (it->first == flush_ack_tid)
5260 cleaned = it->second;
5261 if (it->first <= flush_ack_tid) {
5262 session->flushing_caps_tids.erase(it->first);
5263 in->flushing_cap_tids.erase(it++);
5264 ++flushed;
5265 continue;
5266 }
5267 cleaned &= ~it->second;
5268 if (!cleaned)
5269 break;
5270 ++it;
5271 }
5272
5273 ldout(cct, 5) << __func__ << " mds." << session->mds_num
5274 << " cleaned " << ccap_string(cleaned) << " on " << *in
5275 << " with " << ccap_string(dirty) << dendl;
5276
5277 if (flushed) {
5278 signal_cond_list(in->waitfor_caps);
5279 if (session->flushing_caps_tids.empty() ||
5280 *session->flushing_caps_tids.begin() > flush_ack_tid)
5281 sync_cond.notify_all();
5282 }
5283
5284 if (!dirty) {
5285 in->cap_dirtier_uid = -1;
5286 in->cap_dirtier_gid = -1;
5287 }
5288
5289 if (!cleaned) {
5290 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5291 } else {
5292 if (in->flushing_caps) {
5293 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5294 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5295 in->flushing_caps &= ~cleaned;
5296 if (in->flushing_caps == 0) {
5297 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5298 num_flushing_caps--;
5299 if (in->flushing_cap_tids.empty())
5300 in->flushing_cap_item.remove_myself();
5301 }
5302 if (!in->caps_dirty())
5303 put_inode(in);
5304 }
5305 }
5306 }
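//
// Worked example (illustrative): flushing_cap_tids = {5:Fw, 8:0, 9:Fx}
// and the ack for tid 5 arrives. tid 5 sets cleaned = Fw and is erased;
// tid 8 is skipped (a capsnap flush); tid 9 only covers Fx, so cleaned
// stays Fw and flushing_caps drops Fw. Had tid 9 carried Fw|Fx instead,
// cleaned would be masked down to 0: Fw would still be in flight under
// the later tid and must not be marked clean yet.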
5307
5308
5309 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
5310 {
5311 ceph_tid_t flush_ack_tid = m->get_client_tid();
5312 mds_rank_t mds = session->mds_num;
5313 ceph_assert(in->caps.count(mds));
5314 snapid_t follows = m->get_snap_follows();
5315
5316 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5317 auto& capsnap = it->second;
5318 if (flush_ack_tid != capsnap.flush_tid) {
5319 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
5320 } else {
5321 InodeRef tmp_ref(in);
5322 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
5323 << " on " << *in << dendl;
5324 session->flushing_caps_tids.erase(capsnap.flush_tid);
5325 in->flushing_cap_tids.erase(capsnap.flush_tid);
5326 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5327 in->flushing_cap_item.remove_myself();
5328 in->cap_snaps.erase(it);
5329
5330 signal_cond_list(in->waitfor_caps);
5331 if (session->flushing_caps_tids.empty() ||
5332 *session->flushing_caps_tids.begin() > flush_ack_tid)
5333 sync_cond.notify_all();
5334 }
5335 } else {
5336 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
5337 << " on " << *in << dendl;
5338 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
5339 }
5340 }
5341
5342 class C_Client_DentryInvalidate : public Context {
5343 private:
5344 Client *client;
5345 vinodeno_t dirino;
5346 vinodeno_t ino;
5347 string name;
5348 public:
5349 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5350 client(c), name(dn->name) {
5351 if (client->use_faked_inos()) {
5352 dirino.ino = dn->dir->parent_inode->faked_ino;
5353 if (del)
5354 ino.ino = dn->inode->faked_ino;
5355 } else {
5356 dirino = dn->dir->parent_inode->vino();
5357 if (del)
5358 ino = dn->inode->vino();
5359 }
5360 if (!del)
5361 ino.ino = inodeno_t();
5362 }
5363 void finish(int r) override {
5364 // _async_dentry_invalidate is responsible for its own locking
5365 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
5366 client->_async_dentry_invalidate(dirino, ino, name);
5367 }
5368 };
5369
5370 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5371 {
5372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5373 if (!mref_reader.is_state_satisfied())
5374 return;
5375
5376 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
5377 << " in dir " << dirino << dendl;
5378 dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
5379 }
5380
5381 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5382 {
5383 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5384 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5385 }
5386
5387 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
5388 {
5389 int ref = in->get_num_ref();
5390 ldout(cct, 5) << __func__ << " in " << *in << dendl;
5391
5392 if (in->dir && !in->dir->dentries.empty()) {
5393 for (auto p = in->dir->dentries.begin();
5394 p != in->dir->dentries.end(); ) {
5395 Dentry *dn = p->second;
5396 ++p;
5397 /* rmsnap removes the whole subtree, so we need to trim inodes recursively.
5398 * we don't need to invalidate dentries recursively, because
5399 * invalidating a directory dentry effectively invalidates the
5400 * whole subtree */
5401 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
5402 _try_to_trim_inode(dn->inode.get(), false);
5403
5404 if (dn->lru_is_expireable())
5405 unlink(dn, true, false); // keep dir, drop dentry
5406 }
5407 if (in->dir->dentries.empty()) {
5408 close_dir(in->dir);
5409 --ref;
5410 }
5411 }
5412
5413 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
5414 InodeRef snapdir = open_snapdir(in);
5415 _try_to_trim_inode(snapdir.get(), false);
5416 --ref;
5417 }
5418
5419 if (ref > 0) {
5420 auto q = in->dentries.begin();
5421 while (q != in->dentries.end()) {
5422 Dentry *dn = *q;
5423 ++q;
5424 if (in->ll_ref > 0 && sched_inval) {
5425 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5426 // so in->dentries doesn't always reflect the state of kernel's dcache.
5427 _schedule_invalidate_dentry_callback(dn, true);
5428 }
5429 unlink(dn, true, true);
5430 }
5431 }
5432 }
5433
5434 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
5435 {
5436 mds_rank_t mds = session->mds_num;
5437 int used = get_caps_used(in);
5438 int wanted = in->caps_wanted();
5439
5440 const unsigned new_caps = m->get_caps();
5441 const bool was_stale = session->cap_gen > cap->gen;
5442 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
5443 << " mds." << mds << " seq " << m->get_seq()
5444 << " caps now " << ccap_string(new_caps)
5445 << " was " << ccap_string(cap->issued)
5446 << (was_stale ? " (stale)" : "") << dendl;
5447
5448 if (was_stale)
5449 cap->issued = cap->implemented = CEPH_CAP_PIN;
5450 cap->seq = m->get_seq();
5451 cap->gen = session->cap_gen;
5452
5453 check_cap_issue(in, new_caps);
5454
5455 // update inode
5456 int issued;
5457 in->caps_issued(&issued);
5458 issued |= in->caps_dirty();
5459
5460 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5461 !(issued & CEPH_CAP_AUTH_EXCL)) {
5462 in->mode = m->head.mode;
5463 in->uid = m->head.uid;
5464 in->gid = m->head.gid;
5465 in->btime = m->btime;
5466 }
5467 bool deleted_inode = false;
5468 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5469 !(issued & CEPH_CAP_LINK_EXCL)) {
5470 in->nlink = m->head.nlink;
5471 if (in->nlink == 0 &&
5472 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5473 deleted_inode = true;
5474 }
5475 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
5476 m->xattrbl.length() &&
5477 m->head.xattr_version > in->xattr_version) {
5478 auto p = m->xattrbl.cbegin();
5479 decode(in->xattrs, p);
5480 in->xattr_version = m->head.xattr_version;
5481 }
5482
5483 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5484 in->dirstat.nfiles = m->get_nfiles();
5485 in->dirstat.nsubdirs = m->get_nsubdirs();
5486 }
5487
5488 if (new_caps & CEPH_CAP_ANY_RD) {
5489 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5490 m->get_ctime(), m->get_mtime(), m->get_atime());
5491 }
5492
5493 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5494 in->layout = m->get_layout();
5495 update_inode_file_size(in, issued, m->get_size(),
5496 m->get_truncate_seq(), m->get_truncate_size());
5497 }
5498
5499 if (m->inline_version > in->inline_version) {
5500 in->inline_data = m->inline_data;
5501 in->inline_version = m->inline_version;
5502 }
5503
5504 /* always take a newer change attr */
5505 if (m->get_change_attr() > in->change_attr)
5506 in->change_attr = m->get_change_attr();
5507
5508 // max_size
5509 if (cap == in->auth_cap &&
5510 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5511 (m->get_max_size() != in->max_size)) {
5512 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5513 in->max_size = m->get_max_size();
5514 if (in->max_size > in->wanted_max_size) {
5515 in->wanted_max_size = 0;
5516 in->requested_max_size = 0;
5517 }
5518 }
5519
5520 bool check = false;
5521 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5522 (wanted & ~(cap->wanted | new_caps))) {
5523 // If the mds is importing the cap, prior cap messages that update 'wanted'
5524 // may get dropped by the mds (migrate seq mismatch).
5525 //
5526 // We don't send a cap message to update 'wanted' if what we want is
5527 // already issued. If the mds revokes caps, the cap message that releases
5528 // caps also tells the mds what we want. But if caps were forcibly revoked
5529 // by the mds (stale session), we may not have told the mds what we want.
5530 check = true;
5531 }
5532
5533
5534 // update caps
5535 auto revoked = cap->issued & ~new_caps;
5536 if (revoked) {
5537 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
5538 cap->issued = new_caps;
5539 cap->implemented |= new_caps;
5540
5541 // recall delegations if we're losing caps necessary for them
5542 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5543 in->recall_deleg(false);
5544 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5545 in->recall_deleg(true);
5546
5547 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5548 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
5549 !_flush(in, new C_Client_FlushComplete(this, in))) {
5550 // waitin' for flush
5551 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
5552 if (_release(in))
5553 check = true;
5554 } else {
5555 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5556 check = true;
5557 }
5558 } else if (cap->issued == new_caps) {
5559 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
5560 } else {
5561 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
5562 cap->issued = new_caps;
5563 cap->implemented |= new_caps;
5564
5565 if (cap == in->auth_cap) {
5566 // is a non-auth MDS revoking the newly granted caps?
5567 for (const auto &p : in->caps) {
5568 if (&p.second == cap)
5569 continue;
5570 if (p.second.implemented & ~p.second.issued & new_caps) {
5571 check = true;
5572 break;
5573 }
5574 }
5575 }
5576 }
5577
5578 if (check)
5579 check_caps(in, 0);
5580
5581 // wake up waiters
5582 if (new_caps)
5583 signal_cond_list(in->waitfor_caps);
5584
5585 // may drop inode's last ref
5586 if (deleted_inode)
5587 _try_to_trim_inode(in, true);
5588 }
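// Revocation-handling sketch for the grant path above: losing
// CEPH_CAP_FILE_BUFFER while buffered dirty data is in use kicks off an
// async _flush() and the ack is deferred to C_Client_FlushComplete;
// losing CEPH_CAP_FILE_CACHE drops clean cached data via _release();
// otherwise check_caps() acks the revocation immediately so the MDS is
// not left waiting on this client.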
5589
5590 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5591 {
5592 if (perms.uid() == 0)
5593 return 0;
5594
5595 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5596 int ret = _posix_acl_permission(in, perms, want);
5597 if (ret != -CEPHFS_EAGAIN)
5598 return ret;
5599 }
5600
5601 // check permissions before doing anything else
5602 if (!in->check_mode(perms, want))
5603 return -CEPHFS_EACCES;
5604 return 0;
5605 }
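// A worked sketch of the check order above, with hypothetical values: for
// a non-root caller whose uid differs from the inode owner on an inode
// with any group bits set (in->mode & S_IRWXG), the POSIX ACL is consulted
// first; only when the ACL check answers -CEPHFS_EAGAIN (no applicable
// entry) do we fall back to the classic check_mode() bit test:
//
//   UserPerm perms(1000, 1000);                        // hypothetical caller
//   int r = inode_permission(in, perms, MAY_READ | MAY_EXEC);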
5606
5607 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5608 const UserPerm& perms)
5609 {
5610 int r = _getattr_for_perm(in, perms);
5611 if (r < 0)
5612 goto out;
5613
5614 r = 0;
5615 if (strncmp(name, "system.", 7) == 0) {
5616 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5617 r = -CEPHFS_EPERM;
5618 } else {
5619 r = inode_permission(in, perms, want);
5620 }
5621 out:
5622 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5623 return r;
5624 }
5625
5626 ostream& operator<<(ostream &out, const UserPerm& perm) {
5627 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5628 return out;
5629 }
5630
5631 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5632 const UserPerm& perms)
5633 {
5634 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5635 int r = _getattr_for_perm(in, perms);
5636 if (r < 0)
5637 goto out;
5638
5639 if (mask & CEPH_SETATTR_SIZE) {
5640 r = inode_permission(in, perms, MAY_WRITE);
5641 if (r < 0)
5642 goto out;
5643 }
5644
5645 r = -CEPHFS_EPERM;
5646 if (mask & CEPH_SETATTR_UID) {
5647 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5648 goto out;
5649 }
5650 if (mask & CEPH_SETATTR_GID) {
5651 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5652 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5653 goto out;
5654 }
5655
5656 if (mask & CEPH_SETATTR_MODE) {
5657 if (perms.uid() != 0 && perms.uid() != in->uid)
5658 goto out;
5659
5660 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5661 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5662 stx->stx_mode &= ~S_ISGID;
5663 }
5664
5665 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5666 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5667 if (perms.uid() != 0 && perms.uid() != in->uid) {
5668 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5669 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5670 check_mask |= CEPH_SETATTR_MTIME;
5671 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5672 check_mask |= CEPH_SETATTR_ATIME;
5673 if (check_mask & mask) {
5674 goto out;
5675 } else {
5676 r = inode_permission(in, perms, MAY_WRITE);
5677 if (r < 0)
5678 goto out;
5679 }
5680 }
5681 }
5682 r = 0;
5683 out:
5684 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5685 return r;
5686 }
5687
5688 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5689 {
5690 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5691 unsigned want = 0;
5692
5693 if ((flags & O_ACCMODE) == O_WRONLY)
5694 want = MAY_WRITE;
5695 else if ((flags & O_ACCMODE) == O_RDWR)
5696 want = MAY_READ | MAY_WRITE;
5697 else if ((flags & O_ACCMODE) == O_RDONLY)
5698 want = MAY_READ;
5699 if (flags & O_TRUNC)
5700 want |= MAY_WRITE;
5701
5702 int r = 0;
5703 switch (in->mode & S_IFMT) {
5704 case S_IFLNK:
5705 r = -CEPHFS_ELOOP;
5706 goto out;
5707 case S_IFDIR:
5708 if (want & MAY_WRITE) {
5709 r = -CEPHFS_EISDIR;
5710 goto out;
5711 }
5712 break;
5713 }
5714
5715 r = _getattr_for_perm(in, perms);
5716 if (r < 0)
5717 goto out;
5718
5719 r = inode_permission(in, perms, want);
5720 out:
5721 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5722 return r;
5723 }
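// Flag-mapping sketch (hypothetical call): open(O_RDWR | O_TRUNC) maps to
// want = MAY_READ | MAY_WRITE, so on a directory it fails early with
// -CEPHFS_EISDIR, and on a symlink inode it returns -CEPHFS_ELOOP before
// any permission bits are consulted:
//
//   int r = may_open(in, O_RDWR | O_TRUNC, perms);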
5724
5725 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5726 {
5727 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5728 int r = _getattr_for_perm(dir, perms);
5729 if (r < 0)
5730 goto out;
5731
5732 r = inode_permission(dir, perms, MAY_EXEC);
5733 out:
5734 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5735 return r;
5736 }
5737
5738 int Client::may_create(Inode *dir, const UserPerm& perms)
5739 {
5740 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5741 int r = _getattr_for_perm(dir, perms);
5742 if (r < 0)
5743 goto out;
5744
5745 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5746 out:
5747 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5748 return r;
5749 }
5750
5751 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5752 {
5753 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5754 int r = _getattr_for_perm(dir, perms);
5755 if (r < 0)
5756 goto out;
5757
5758 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5759 if (r < 0)
5760 goto out;
5761
5762 /* 'name == NULL' means rmsnap w/o permission checks */
5763 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5764 InodeRef otherin;
5765 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5766 if (r < 0)
5767 goto out;
5768 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5769 r = -CEPHFS_EPERM;
5770 }
5771 out:
5772 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5773 return r;
5774 }
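// Sticky-bit sketch: in a directory with S_ISVTX set (/tmp-style
// semantics), a non-root caller may only unlink entries when it owns
// either the directory or the entry itself; e.g. uid 1000 removing uid
// 2000's file from a sticky directory owned by root gets -CEPHFS_EPERM.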
5775
5776 int Client::may_delete(const char *relpath, const UserPerm& perms) {
5777 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5778
5779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5780 if (!mref_reader.is_state_satisfied())
5781 return -CEPHFS_ENOTCONN;
5782
5783 filepath path(relpath);
5784 string name = path.last_dentry();
5785 path.pop_dentry();
5786 InodeRef dir;
5787
5788 std::scoped_lock lock(client_lock);
5789 int r = path_walk(path, &dir, perms);
5790 if (r < 0)
5791 return r;
5792 if (cct->_conf->client_permissions) {
5793 int r = may_delete(dir.get(), name.c_str(), perms);
5794 if (r < 0)
5795 return r;
5796 }
5797
5798 return 0;
5799 }
5800
5801 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5802 {
5803 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5804 int r = _getattr_for_perm(in, perms);
5805 if (r < 0)
5806 goto out;
5807
5808 if (perms.uid() == 0 || perms.uid() == in->uid) {
5809 r = 0;
5810 goto out;
5811 }
5812
5813 r = -CEPHFS_EPERM;
5814 if (!S_ISREG(in->mode))
5815 goto out;
5816
5817 if (in->mode & S_ISUID)
5818 goto out;
5819
5820 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5821 goto out;
5822
5823 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5824 out:
5825 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5826 return r;
5827 }
5828
5829 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5830 {
5831 int mask = CEPH_STAT_CAP_MODE;
5832 bool force = false;
5833 if (acl_type != NO_ACL) {
5834 mask |= CEPH_STAT_CAP_XATTR;
5835 force = in->xattr_version == 0;
5836 }
5837 return _getattr(in, mask, perms, force);
5838 }
5839
5840 vinodeno_t Client::_get_vino(Inode *in)
5841 {
5842 /* The caller must hold the client lock */
5843 return vinodeno_t(in->ino, in->snapid);
5844 }
5845
5846 /**
5847 * Resolve an MDS spec to a list of MDS daemon GIDs.
5848 *
5849 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5850 * It may be '*' in which case it matches all GIDs.
5851 *
5852 * If no error is returned, the `targets` vector will be populated with at least
5853 * one MDS.
5854 */
5855 int Client::resolve_mds(
5856 const std::string &mds_spec,
5857 std::vector<mds_gid_t> *targets)
5858 {
5859 ceph_assert(fsmap);
5860 ceph_assert(targets != nullptr);
5861
5862 mds_role_t role;
5863 CachedStackStringStream css;
5864 int role_r = fsmap->parse_role(mds_spec, &role, *css);
5865 if (role_r == 0) {
5866 // We got a role, resolve it to a GID
5867 auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
5868 ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
5869 << role << "' aka " << info.human_name() << dendl;
5870 targets->push_back(info.global_id);
5871 return 0;
5872 }
5873
5874 std::string strtol_err;
5875 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5876 if (strtol_err.empty()) {
5877 // It is a possible GID
5878 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5879 if (fsmap->gid_exists(mds_gid)) {
5880 auto& info = fsmap->get_info_gid(mds_gid);
5881 ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
5882 << info.human_name() << dendl;
5883 targets->push_back(mds_gid);
5884 return 0;
5885 } else {
5886 lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
5887 << dendl;
5888 lderr(cct) << "FSMap: " << *fsmap << dendl;
5889 return -CEPHFS_ENOENT;
5890 }
5891 } else if (mds_spec == "*") {
5892 // It is a wildcard: use all MDSs
5893 const auto& mds_info = fsmap->get_mds_info();
5894
5895 ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
5896 if (mds_info.empty()) {
5897 lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
5898 lderr(cct) << "FSMap: " << *fsmap << dendl;
5899 return -CEPHFS_ENOENT;
5900 }
5901
5902 for (const auto& [gid, info] : mds_info) {
5903 ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
5904 targets->push_back(gid);
5905 }
5906 return 0;
5907 } else {
5908 // It did not parse as an integer, it is not a wildcard, it must be a name
5909 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5910 if (mds_gid == 0) {
5911 lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
5912 lderr(cct) << "FSMap: " << *fsmap << dendl;
5913 return -CEPHFS_ENOENT;
5914 } else {
5915 auto& info = fsmap->get_info_gid(mds_gid);
5916 ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
5917 << "' to " << info.human_name() << dendl;
5918 targets->push_back(mds_gid);
5919 }
5920 return 0;
5921 }
5922 }
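// Spec-resolution sketch with hypothetical inputs: "cephfs:0" (or a bare
// rank) resolves through the role parser; "4123" is treated as a GID if
// one exists; "*" expands to every MDS in the FSMap; anything else is
// looked up as a daemon name:
//
//   std::vector<mds_gid_t> targets;
//   int r = resolve_mds("*", &targets);  // 0 on success, -CEPHFS_ENOENT if none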
5923
5924
5925 /**
5926 * Authenticate with mon and establish global ID
5927 */
5928 int Client::authenticate()
5929 {
5930 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5931
5932 if (monclient->is_authenticated()) {
5933 return 0;
5934 }
5935
5936 client_lock.unlock();
5937 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5938 client_lock.lock();
5939 if (r < 0) {
5940 return r;
5941 }
5942
5943 whoami = monclient->get_global_id();
5944 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5945
5946 return 0;
5947 }
5948
5949 int Client::fetch_fsmap(bool user)
5950 {
5951 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
5952
5953 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5954 // rather than MDSMap because no one MDSMap contains all the daemons, and
5955 // a `tell` can address any daemon.
5956 version_t fsmap_latest;
5957 bs::error_code ec;
5958 do {
5959 client_lock.unlock();
5960 std::tie(fsmap_latest, std::ignore) =
5961 monclient->get_version("fsmap", ca::use_blocked[ec]);
5962 client_lock.lock();
5963 } while (ec == bs::errc::resource_unavailable_try_again);
5964
5965 if (ec) {
5966 lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
5967 return ceph::from_error_code(ec);
5968 }
5969
5970 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5971
5972 if (user) {
5973 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5974 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5975 monclient->renew_subs();
5976 wait_on_list(waiting_for_fsmap);
5977 }
5978 ceph_assert(fsmap_user);
5979 ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
5980 } else {
5981 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5982 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5983 monclient->renew_subs();
5984 wait_on_list(waiting_for_fsmap);
5985 }
5986 ceph_assert(fsmap);
5987 ceph_assert(fsmap->get_epoch() >= fsmap_latest);
5988 }
5989 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5990 << fsmap_latest << dendl;
5991 return 0;
5992 }
5993
5994 /**
5995 * Send a command to MDS daemons matching the spec.
5996 *
5997 * @mds_spec one of ID, rank, GID, "*"
5998 */
5999 int Client::mds_command(
6000 const std::string &mds_spec,
6001 const vector<string>& cmd,
6002 const bufferlist& inbl,
6003 bufferlist *outbl,
6004 string *outs,
6005 Context *onfinish)
6006 {
6007 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
6008 if (!iref_reader.is_state_satisfied())
6009 return -CEPHFS_ENOTCONN;
6010
6011 std::unique_lock cl(client_lock);
6012
6013 int r;
6014 r = authenticate();
6015 if (r < 0) {
6016 return r;
6017 }
6018
6019 r = fetch_fsmap(false);
6020 if (r < 0) {
6021 return r;
6022 }
6023
6024 // Look up MDS target(s) of the command
6025 std::vector<mds_gid_t> targets;
6026 r = resolve_mds(mds_spec, &targets);
6027 if (r < 0) {
6028 return r;
6029 }
6030
6031 // If daemons are laggy, we won't send them commands. If all
6032 // are laggy then we fail.
6033 std::vector<mds_gid_t> non_laggy;
6034 for (const auto& gid : targets) {
6035 const auto info = fsmap->get_info_gid(gid);
6036 if (!info.laggy()) {
6037 non_laggy.push_back(gid);
6038 }
6039 }
6040 if (non_laggy.empty()) {
6041 *outs = "All targeted MDS daemons are laggy";
6042 return -CEPHFS_ENOENT;
6043 }
6044
6045 if (metadata.empty()) {
6046 // We are called on an unmounted client, so metadata
6047 // won't be initialized yet.
6048 populate_metadata("");
6049 }
6050
6051 // Send commands to targets
6052 C_GatherBuilder gather(cct, onfinish);
6053 for (const auto& target_gid : non_laggy) {
6054 const auto info = fsmap->get_info_gid(target_gid);
6055
6056 // Open a connection to the target MDS
6057 ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());
6058
6059 cl.unlock();
6060 {
6061 std::scoped_lock cmd_lock(command_lock);
6062 // Generate MDSCommandOp state
6063 auto &op = command_table.start_command();
6064
6065 op.on_finish = gather.new_sub();
6066 op.cmd = cmd;
6067 op.outbl = outbl;
6068 op.outs = outs;
6069 op.inbl = inbl;
6070 op.mds_gid = target_gid;
6071 op.con = conn;
6072
6073 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
6074 << " tid=" << op.tid << cmd << dendl;
6075
6076 // Construct and send MCommand
6077 MessageRef m = op.get_message(monclient->get_fsid());
6078 conn->send_message2(std::move(m));
6079 }
6080 cl.lock();
6081 }
6082 gather.activate();
6083
6084 return 0;
6085 }
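// From user space this path is normally reached through the libcephfs
// wrapper; a minimal sketch (error handling and buffer cleanup elided):
//
//   const char *cmd[] = { "{\"prefix\": \"session ls\"}" };
//   char *outb = NULL, *outs = NULL;
//   size_t outb_len = 0, outs_len = 0;
//   int r = ceph_mds_command(cmount, "*", cmd, 1, NULL, 0,
//                            &outb, &outb_len, &outs, &outs_len);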
6086
6087 void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
6088 {
6089 ceph_tid_t const tid = m->get_tid();
6090
6091 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6092
6093 std::scoped_lock cmd_lock(command_lock);
6094 if (!command_table.exists(tid)) {
6095 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
6096 return;
6097 }
6098
6099 auto &op = command_table.get_command(tid);
6100 if (op.outbl) {
6101 *op.outbl = m->get_data();
6102 }
6103 if (op.outs) {
6104 *op.outs = m->rs;
6105 }
6106
6107 if (op.on_finish) {
6108 op.on_finish->complete(m->r);
6109 }
6110
6111 command_table.erase(tid);
6112 }
6113
6114 // -------------------
6115 // MOUNT
6116
6117 int Client::subscribe_mdsmap(const std::string &fs_name)
6118 {
6119 int r = authenticate();
6120 if (r < 0) {
6121 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6122 return r;
6123 }
6124
6125 std::string resolved_fs_name;
6126 if (fs_name.empty()) {
6127 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6128 if (resolved_fs_name.empty())
6129 // Try the backwards compatibility fs name option
6130 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
6131 } else {
6132 resolved_fs_name = fs_name;
6133 }
6134
6135 std::string want = "mdsmap";
6136 if (!resolved_fs_name.empty()) {
6137 r = fetch_fsmap(true);
6138 if (r < 0)
6139 return r;
6140 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6141 if (fscid == FS_CLUSTER_ID_NONE) {
6142 return -CEPHFS_ENOENT;
6143 }
6144
6145 std::ostringstream oss;
6146 oss << want << "." << fscid;
6147 want = oss.str();
6148 }
6149 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6150
6151 monclient->sub_want(want, 0, 0);
6152 monclient->renew_subs();
6153
6154 return 0;
6155 }
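// Subscription-key sketch: with client_fs (or the legacy
// client_mds_namespace option) naming a filesystem whose fscid is 1, the
// monitor subscription becomes "mdsmap.1"; with no filesystem name it
// stays the plain "mdsmap".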
6156
6157 int Client::mount(const std::string &mount_root, const UserPerm& perms,
6158 bool require_mds, const std::string &fs_name)
6159 {
6160 ceph_assert(is_initialized());
6161
6162 /*
6163 * Ensure that _unmount() waits until this mount() has
6164 * finished.
6165 */
6166 RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
6167 if (!mref_writer.is_first_writer()) // already mounting or mounted
6168 return 0;
6169
6170 std::unique_lock cl(client_lock);
6171
6172 int r = subscribe_mdsmap(fs_name);
6173 if (r < 0) {
6174 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
6175 return r;
6176 }
6177
6178 start_tick_thread(); // start tick thread
6179
6180 if (require_mds) {
6181 while (1) {
6182 auto availability = mdsmap->is_cluster_available();
6183 if (availability == MDSMap::STUCK_UNAVAILABLE) {
6184 // Error out
6185 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
6186 return CEPH_FUSE_NO_MDS_UP;
6187 } else if (availability == MDSMap::AVAILABLE) {
6188 // Continue to mount
6189 break;
6190 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
6191 // Else, wait. MDSMonitor will update the map to bring
6192 // us to a conclusion eventually.
6193 wait_on_list(waiting_for_mdsmap);
6194 } else {
6195 // Unexpected value!
6196 ceph_abort();
6197 }
6198 }
6199 }
6200
6201 populate_metadata(mount_root.empty() ? "/" : mount_root);
6202
6203 filepath fp(CEPH_INO_ROOT);
6204 if (!mount_root.empty()) {
6205 fp = filepath(mount_root.c_str());
6206 }
6207 while (true) {
6208 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6209 req->set_filepath(fp);
6210 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
6211 int res = make_request(req, perms);
6212 if (res < 0) {
6213 if (res == -CEPHFS_EACCES && root) {
6214 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
6215 break;
6216 }
6217 return res;
6218 }
6219
6220 if (fp.depth())
6221 fp.pop_dentry();
6222 else
6223 break;
6224 }
6225
6226 ceph_assert(root);
6227 _ll_get(root);
6228
6229 // trace?
6230 if (!cct->_conf->client_trace.empty()) {
6231 traceout.open(cct->_conf->client_trace.c_str());
6232 if (traceout.is_open()) {
6233 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
6234 } else {
6235 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
6236 }
6237 }
6238
6239 /*
6240 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6241 ldout(cct, 3) << "op: struct stat st;" << dendl;
6242 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6243 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6244 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6245 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6246 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6247 ldout(cct, 3) << "op: int fd;" << dendl;
6248 */
6249
6250 mref_writer.update_state(CLIENT_MOUNTED);
6251 return 0;
6252 }
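// Mount usage sketch (hypothetical caller, named filesystem):
//
//   UserPerm perms(0, 0);
//   int r = client->mount("/", perms, /*require_mds=*/true, "cephfs");
//   if (r == 0) {
//     // mounted; the root inode is pinned via _ll_get() above
//   }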
6253
6254 // UNMOUNT
6255
6256 void Client::_close_sessions()
6257 {
6258 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
6259 if (it->second.state == MetaSession::STATE_REJECTED)
6260 mds_sessions.erase(it++);
6261 else
6262 ++it;
6263 }
6264
6265 while (!mds_sessions.empty()) {
6266 // send session closes!
6267 for (auto &p : mds_sessions) {
6268 if (p.second.state != MetaSession::STATE_CLOSING) {
6269 _close_mds_session(&p.second);
6270 mds_ranks_closing.insert(p.first);
6271 }
6272 }
6273
6274 // wait for sessions to close
6275 double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
6276 ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
6277 << timo << "s)" << dendl;
6278 std::unique_lock l{client_lock, std::adopt_lock};
6279 if (!timo) {
6280 mount_cond.wait(l);
6281 } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
6282 ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
6283 while (!mds_ranks_closing.empty()) {
6284 auto session = mds_sessions.at(*mds_ranks_closing.begin());
6285 // this prunes entry from mds_sessions and mds_ranks_closing
6286 _closed_mds_session(&session, -CEPHFS_ETIMEDOUT);
6287 }
6288 }
6289
6290 mds_ranks_closing.clear();
6291 l.release();
6292 }
6293 }
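// Shutdown-timeout sketch: client_shutdown_timeout (seconds) bounds the
// wait above; e.g. a value of 30 force-closes unresponsive sessions with
// -CEPHFS_ETIMEDOUT after 30s, while 0 waits indefinitely.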
6294
6295 void Client::flush_mdlog_sync()
6296 {
6297 if (mds_requests.empty())
6298 return;
6299 for (auto &p : mds_sessions) {
6300 flush_mdlog(&p.second);
6301 }
6302 }
6303
6304 void Client::flush_mdlog(MetaSession *session)
6305 {
6306 // Only send this to Luminous or newer MDS daemons, older daemons
6307 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6308 const uint64_t features = session->con->get_features();
6309 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
6310 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
6311 session->con->send_message2(std::move(m));
6312 }
6313 }
6314
6315
6316 void Client::_abort_mds_sessions(int err)
6317 {
6318 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6319 auto req = p->second;
6320 ++p;
6321 // unsafe requests will be removed during close session below.
6322 if (req->got_unsafe)
6323 continue;
6324
6325 req->abort(err);
6326 if (req->caller_cond) {
6327 req->kick = true;
6328 req->caller_cond->notify_all();
6329 }
6330 }
6331
6332 // Process aborts on any requests that were on this waitlist.
6333 // Any requests that were on a waiting_for_open session waitlist
6334 // will get kicked during close session below.
6335 signal_cond_list(waiting_for_mdsmap);
6336
6337 // Force-close all sessions
6338 while(!mds_sessions.empty()) {
6339 auto& session = mds_sessions.begin()->second;
6340 _closed_mds_session(&session, err);
6341 }
6342 }
6343
6344 void Client::_unmount(bool abort)
6345 {
6346 /*
6347 * We are unmounting the client.
6348 *
6349 * Declare the state to be CLIENT_UNMOUNTING to block and fail
6350 * any newly arriving "readers", then wait for all the in-flight
6351 * "readers" to finish.
6352 */
6353 RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
6354 if (!mref_writer.is_first_writer())
6355 return;
6356 mref_writer.wait_readers_done();
6357
6358 std::unique_lock lock{client_lock};
6359
6360 if (abort || blocklisted) {
6361 ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
6362 } else {
6363 ldout(cct, 2) << "unmounting" << dendl;
6364 }
6365
6366 deleg_timeout = 0;
6367
6368 if (abort) {
6369 mount_aborted = true;
6370 // Abort all mds sessions
6371 _abort_mds_sessions(-CEPHFS_ENOTCONN);
6372
6373 objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
6374 } else {
6375 // flush the mdlog for pending requests, if any
6376 flush_mdlog_sync();
6377 }
6378
6379 mount_cond.wait(lock, [this] {
6380 if (!mds_requests.empty()) {
6381 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
6382 << dendl;
6383 }
6384 return mds_requests.empty();
6385 });
6386
6387 cwd.reset();
6388
6389 // clean up any unclosed files
6390 while (!fd_map.empty()) {
6391 Fh *fh = fd_map.begin()->second;
6392 fd_map.erase(fd_map.begin());
6393 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
6394 _release_fh(fh);
6395 }
6396
6397 while (!ll_unclosed_fh_set.empty()) {
6398 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
6399 Fh *fh = *it;
6400 ll_unclosed_fh_set.erase(fh);
6401 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
6402 _release_fh(fh);
6403 }
6404
6405 while (!opened_dirs.empty()) {
6406 dir_result_t *dirp = *opened_dirs.begin();
6407 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
6408 _closedir(dirp);
6409 }
6410
6411 _ll_drop_pins();
6412
6413 if (cct->_conf->client_oc) {
6414 // flush/release all buffered data
6415 std::list<InodeRef> anchor;
6416 for (auto& p : inode_map) {
6417 Inode *in = p.second;
6418 if (!in) {
6419 ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
6420 ceph_assert(in);
6421 }
6422
6423 // prevent inode from getting freed
6424 anchor.emplace_back(in);
6425
6426 if (abort || blocklisted) {
6427 objectcacher->purge_set(&in->oset);
6428 } else if (!in->caps.empty()) {
6429 _release(in);
6430 _flush(in, new C_Client_FlushComplete(this, in));
6431 }
6432 }
6433 }
6434
6435 if (abort || blocklisted) {
6436 for (auto p = dirty_list.begin(); !p.end(); ) {
6437 Inode *in = *p;
6438 ++p;
6439 if (in->dirty_caps) {
6440 ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
6441 in->mark_caps_clean();
6442 put_inode(in);
6443 }
6444 }
6445 } else {
6446 flush_caps_sync();
6447 wait_sync_caps(last_flush_tid);
6448 }
6449
6450 // empty lru cache
6451 trim_cache();
6452
6453 delay_put_inodes();
6454
6455 while (lru.lru_get_size() > 0 ||
6456 !inode_map.empty()) {
6457 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
6458 << "+" << inode_map.size() << " items"
6459 << ", waiting (for caps to release?)"
6460 << dendl;
6461
6462 if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
6463 r == std::cv_status::timeout) {
6464 dump_cache(NULL);
6465 }
6466 }
6467 ceph_assert(lru.lru_get_size() == 0);
6468 ceph_assert(inode_map.empty());
6469
6470 // stop tracing
6471 if (!cct->_conf->client_trace.empty()) {
6472 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
6473 traceout.close();
6474 }
6475
6476 // stop the tick thread
6477 tick_thread_stopped = true;
6478 upkeep_cond.notify_one();
6479
6480 _close_sessions();
6481
6482 mref_writer.update_state(CLIENT_UNMOUNTED);
6483
6484 ldout(cct, 2) << "unmounted." << dendl;
6485 }
6486
6487 void Client::unmount()
6488 {
6489 _unmount(false);
6490 }
6491
6492 void Client::abort_conn()
6493 {
6494 _unmount(true);
6495 }
6496
6497 void Client::flush_cap_releases()
6498 {
6499 uint64_t nr_caps = 0;
6500
6501 // send any cap releases
6502 for (auto &p : mds_sessions) {
6503 auto &session = p.second;
6504 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6505 p.first)) {
6506 nr_caps += session.release->caps.size();
6507 if (cct->_conf->client_inject_release_failure) {
6508 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6509 } else {
6510 session.con->send_message2(std::move(session.release));
6511 }
6512 session.release.reset();
6513 }
6514 }
6515
6516 if (nr_caps > 0) {
6517 dec_pinned_icaps(nr_caps);
6518 }
6519 }
6520
6521 void Client::renew_and_flush_cap_releases()
6522 {
6523 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6524
6525 if (!mount_aborted && mdsmap->get_epoch()) {
6526 // renew caps?
6527 utime_t el = ceph_clock_now() - last_cap_renew;
6528 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6529 renew_caps();
6530
6531 flush_cap_releases();
6532 }
6533 }
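// Renewal-cadence sketch: renewals are sent once more than a third of the
// MDS session timeout has elapsed, so with a 60s mds_session_timeout caps
// are renewed roughly every 20s while the mount is healthy.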
6534
6535 void Client::tick()
6536 {
6537 ldout(cct, 20) << "tick" << dendl;
6538
6539 utime_t now = ceph_clock_now();
6540
6541 /*
6542 * If the mount() has not finished yet, time out stuck requests
6543 */
6544 if (is_mounting() && !mds_requests.empty()) {
6545 MetaRequest *req = mds_requests.begin()->second;
6546
6547 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
6548 req->abort(-CEPHFS_ETIMEDOUT);
6549 if (req->caller_cond) {
6550 req->kick = true;
6551 req->caller_cond->notify_all();
6552 }
6553 signal_cond_list(waiting_for_mdsmap);
6554 for (auto &p : mds_sessions) {
6555 signal_context_list(p.second.waiting_for_open);
6556 }
6557 }
6558 }
6559
6560 renew_and_flush_cap_releases();
6561
6562 // delayed caps
6563 xlist<Inode*>::iterator p = delayed_list.begin();
6564 while (!p.end()) {
6565 Inode *in = *p;
6566 ++p;
6567 if (!mount_aborted && in->hold_caps_until > now)
6568 break;
6569 delayed_list.pop_front();
6570 if (!mount_aborted)
6571 check_caps(in, CHECK_CAPS_NODELAY);
6572 }
6573
6574 if (!mount_aborted)
6575 collect_and_send_metrics();
6576
6577 delay_put_inodes(is_unmounting());
6578 trim_cache(true);
6579
6580 if (blocklisted && (is_mounted() || is_unmounting()) &&
6581 last_auto_reconnect + 30 * 60 < now &&
6582 cct->_conf.get_val<bool>("client_reconnect_stale")) {
6583 messenger->client_reset();
6584 fd_gen++; // invalidate open files
6585 blocklisted = false;
6586 _kick_stale_sessions();
6587 last_auto_reconnect = now;
6588 }
6589 }
6590
6591 void Client::start_tick_thread()
6592 {
6593 upkeeper = std::thread([this]() {
6594 using time = ceph::coarse_mono_time;
6595 using sec = std::chrono::seconds;
6596
6597 auto last_tick = time::min();
6598
6599 std::unique_lock cl(client_lock);
6600 while (!tick_thread_stopped) {
6601 auto now = clock::now();
6602 auto since = now - last_tick;
6603
6604 auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
6605 auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));
6606
6607 auto interval = std::max(t_interval, d_interval);
6608 if (likely(since >= interval * .90)) {
6609 tick();
6610 last_tick = clock::now();
6611 } else {
6612 interval -= since;
6613 }
6614
6615 ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
6616 if (!tick_thread_stopped)
6617 upkeep_cond.wait_for(cl, interval);
6618 }
6619 });
6620 }
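// Tick-interval sketch: the effective period is
// max(client_tick_interval, client_debug_inject_tick_delay), so with a 1s
// tick interval and no injected delay the loop ticks about once per
// second; injecting a larger debug delay stretches the period to match.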
6621
6622 void Client::collect_and_send_metrics() {
6623 ldout(cct, 20) << __func__ << dendl;
6624
6625 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6626
6627 // right now, we only track and send global metrics. it's sufficient
6628 // to send these metrics to MDS rank 0.
6629 collect_and_send_global_metrics();
6630 }
6631
6632 void Client::collect_and_send_global_metrics() {
6633 ldout(cct, 20) << __func__ << dendl;
6634 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6635
6636 if (!have_open_session((mds_rank_t)0)) {
6637 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6638 << dendl;
6639 return;
6640 }
6641 auto session = _get_or_open_mds_session((mds_rank_t)0);
6642 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6643 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6644 return;
6645 }
6646
6647 ClientMetricMessage metric;
6648 std::vector<ClientMetricMessage> message;
6649
6650 // read latency
6651 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6652 message.push_back(metric);
6653
6654 // write latency
6655 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6656 message.push_back(metric);
6657
6658 // metadata latency
6659 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6660 message.push_back(metric);
6661
6662 // cap hit ratio -- nr_caps is unused right now
6663 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6664 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6665 message.push_back(metric);
6666
6667 // dentry lease hit ratio
6668 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6669 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6670 message.push_back(metric);
6671
6672 // opened files
6673 {
6674 auto [opened_files, total_inodes] = get_opened_files_rates();
6675 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
6676 }
6677 message.push_back(metric);
6678
6679 // pinned i_caps
6680 {
6681 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6682 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
6683 }
6684 message.push_back(metric);
6685
6686 // opened inodes
6687 {
6688 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6689 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
6690 }
6691 message.push_back(metric);
6692
6693 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6694 }
6695
6696 void Client::renew_caps()
6697 {
6698 ldout(cct, 10) << "renew_caps()" << dendl;
6699 last_cap_renew = ceph_clock_now();
6700
6701 for (auto &p : mds_sessions) {
6702 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6703 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6704 renew_caps(&p.second);
6705 }
6706 }
6707
6708 void Client::renew_caps(MetaSession *session)
6709 {
6710 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6711 session->last_cap_renew_request = ceph_clock_now();
6712 uint64_t seq = ++session->cap_renew_seq;
6713 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6714 }
6715
6716
6717 // ===============================================================
6718 // high level (POSIXy) interface
6719
6720 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6721 InodeRef *target, const UserPerm& perms)
6722 {
6723 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6724 MetaRequest *req = new MetaRequest(op);
6725 filepath path;
6726 dir->make_nosnap_relative_path(path);
6727 path.push_dentry(name);
6728 req->set_filepath(path);
6729 req->set_inode(dir);
6730 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6731 mask |= DEBUG_GETATTR_CAPS;
6732 req->head.args.getattr.mask = mask;
6733
6734 ldout(cct, 10) << __func__ << " on " << path << dendl;
6735
6736 int r = make_request(req, perms, target);
6737 ldout(cct, 10) << __func__ << " res is " << r << dendl;
6738 return r;
6739 }
6740
6741 bool Client::_dentry_valid(const Dentry *dn)
6742 {
6743 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6744
6745 // is dn lease valid?
6746 utime_t now = ceph_clock_now();
6747 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6748 mds_sessions.count(dn->lease_mds)) {
6749 MetaSession &s = mds_sessions.at(dn->lease_mds);
6750 if (s.cap_ttl > now && s.cap_gen == dn->lease_gen) {
6751 dlease_hit();
6752 return true;
6753 }
6754
6755 ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
6756 << " vs lease_gen " << dn->lease_gen << dendl;
6757 }
6758
6759 dlease_miss();
6760 return false;
6761 }
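// Lease-validity sketch: a dentry lease from mds.N only counts as valid
// while its own ttl is in the future, we still hold a session to mds.N,
// that session's cap_ttl has not expired, and the session cap_gen still
// matches the lease_gen recorded when the lease was issued; anything else
// is scored as a dlease miss.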
6762
6763 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6764 const UserPerm& perms, std::string* alternate_name)
6765 {
6766 int r = 0;
6767 Dentry *dn = NULL;
6768 bool did_lookup_request = false;
6769 // can only request shared caps
6770 mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;
6771
6772 if (dname == "..") {
6773 if (dir->dentries.empty()) {
6774 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
6775 filepath path(dir->ino);
6776 req->set_filepath(path);
6777
6778 InodeRef tmptarget;
6779 int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());
6780
6781 if (r == 0) {
6782 *target = std::move(tmptarget);
6783 ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
6784 } else {
6785 *target = dir;
6786 }
6787 }
6788 else
6789 *target = dir->get_first_parent()->dir->parent_inode; // dirs can't be hard-linked
6790 goto done;
6791 }
6792
6793 if (dname == ".") {
6794 *target = dir;
6795 goto done;
6796 }
6797
6798 if (!dir->is_dir()) {
6799 r = -CEPHFS_ENOTDIR;
6800 goto done;
6801 }
6802
6803 if (dname.length() > NAME_MAX) {
6804 r = -CEPHFS_ENAMETOOLONG;
6805 goto done;
6806 }
6807
6808 if (dname == cct->_conf->client_snapdir &&
6809 dir->snapid == CEPH_NOSNAP) {
6810 *target = open_snapdir(dir);
6811 goto done;
6812 }
6813
6814 relookup:
6815 if (dir->dir &&
6816 dir->dir->dentries.count(dname)) {
6817 dn = dir->dir->dentries[dname];
6818
6819 ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
6820 << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;
6821
6822 if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
6823 if (_dentry_valid(dn)) {
6824 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6825 // make trim_caps() behave.
6826 dir->try_touch_cap(dn->lease_mds);
6827 goto hit_dn;
6828 }
6829 // dir shared caps?
6830 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
6831 if (dn->cap_shared_gen == dir->shared_gen &&
6832 (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
6833 goto hit_dn;
6834 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6835 ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
6836 << *dir << " dn '" << dname << "'" << dendl;
6837 return -CEPHFS_ENOENT;
6838 }
6839 }
6840 } else {
6841 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6842 }
6843 } else {
6844 // can we conclude ENOENT locally?
6845 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
6846 (dir->flags & I_COMPLETE)) {
6847 ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6848 return -CEPHFS_ENOENT;
6849 }
6850 }
6851
6852 if (did_lookup_request) {
6853 r = 0;
6854 goto done;
6855 }
6856 r = _do_lookup(dir, dname, mask, target, perms);
6857 did_lookup_request = true;
6858 if (r == 0) {
6859 /* complete lookup to get dentry for alternate_name */
6860 goto relookup;
6861 } else {
6862 goto done;
6863 }
6864
6865 hit_dn:
6866 if (dn->inode) {
6867 *target = dn->inode;
6868 if (alternate_name)
6869 *alternate_name = dn->alternate_name;
6870 } else {
6871 r = -CEPHFS_ENOENT;
6872 }
6873 touch_dn(dn);
6874 goto done;
6875
6876 done:
6877 if (r < 0)
6878 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
6879 else
6880 ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
6881 return r;
6882 }
6883
6884 int Client::get_or_create(Inode *dir, const char* name,
6885 Dentry **pdn, bool expect_null)
6886 {
6887 // lookup
6888 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
6889 dir->open_dir();
6890 if (dir->dir->dentries.count(name)) {
6891 Dentry *dn = dir->dir->dentries[name];
6892 if (_dentry_valid(dn)) {
6893 if (expect_null)
6894 return -CEPHFS_EEXIST;
6895 }
6896 *pdn = dn;
6897 } else {
6898 // otherwise link up a new one
6899 *pdn = link(dir->dir, name, NULL, NULL);
6900 }
6901
6902 // success
6903 return 0;
6904 }
6905
6906 int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
6907 {
6908 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6909 if (!mref_reader.is_state_satisfied())
6910 return -CEPHFS_ENOTCONN;
6911
6912 ldout(cct, 10) << __func__ << ": " << path << dendl;
6913
6914 std::scoped_lock lock(client_lock);
6915
6916 return path_walk(path, wdr, perms, followsym);
6917 }
6918
6919 int Client::path_walk(const filepath& origpath, InodeRef *end,
6920 const UserPerm& perms, bool followsym, int mask)
6921 {
6922 walk_dentry_result wdr;
6923 int rc = path_walk(origpath, &wdr, perms, followsym, mask);
6924 *end = std::move(wdr.in);
6925 return rc;
6926 }
6927
6928 int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms, bool followsym, int mask)
6929 {
6930 filepath path = origpath;
6931 InodeRef cur;
6932 std::string alternate_name;
6933 if (origpath.absolute())
6934 cur = root;
6935 else
6936 cur = cwd;
6937 ceph_assert(cur);
6938
6939 ldout(cct, 10) << __func__ << " " << path << dendl;
6940
6941 int symlinks = 0;
6942
6943 unsigned i=0;
6944 while (i < path.depth() && cur) {
6945 int caps = 0;
6946 const string &dname = path[i];
6947 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6948 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6949 InodeRef next;
6950 if (cct->_conf->client_permissions) {
6951 int r = may_lookup(cur.get(), perms);
6952 if (r < 0)
6953 return r;
6954 caps = CEPH_CAP_AUTH_SHARED;
6955 }
6956
6957 /* Get extra requested caps on the last component */
6958 if (i == (path.depth() - 1))
6959 caps |= mask;
6960 int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
6961 if (r < 0)
6962 return r;
6963 // only follow a trailing symlink if followsym; always follow
6964 // intermediate 'directory' symlinks.
6965 if (next && next->is_symlink()) {
6966 symlinks++;
6967 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6968 if (symlinks > MAXSYMLINKS) {
6969 return -CEPHFS_ELOOP;
6970 }
6971
6972 if (i < path.depth() - 1) {
6973 // dir symlink
6974 // replace consumed components of path with symlink dir target
6975 filepath resolved(next->symlink.c_str());
6976 resolved.append(path.postfixpath(i + 1));
6977 path = resolved;
6978 i = 0;
6979 if (next->symlink[0] == '/') {
6980 cur = root;
6981 }
6982 continue;
6983 } else if (followsym) {
6984 if (next->symlink[0] == '/') {
6985 path = next->symlink.c_str();
6986 i = 0;
6987 // reset position
6988 cur = root;
6989 } else {
6990 filepath more(next->symlink.c_str());
6991 // we need to remove the symlink component from the path
6992 // before appending the target that the symlink points to,
6993 // remaining at the same position in the path.
6994 path.pop_dentry();
6995 path.append(more);
6996 }
6997 continue;
6998 }
6999 }
7000 cur.swap(next);
7001 i++;
7002 }
7003 if (!cur)
7004 return -CEPHFS_ENOENT;
7005 if (result) {
7006 result->in = std::move(cur);
7007 result->alternate_name = std::move(alternate_name);
7008 }
7009 return 0;
7010 }
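// Symlink-resolution sketch: walking "a/b/c" where "b" is a symlink to
// "/x" rewrites the remaining path to "/x/c" and restarts from the root;
// a relative target is spliced in at the current position instead. A
// trailing symlink ("c" itself) is only followed when followsym is true,
// and more than MAXSYMLINKS expansions yields -CEPHFS_ELOOP.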
7011
7012
7013 // namespace ops
7014
7015 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
7016 {
7017 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7018 if (!mref_reader.is_state_satisfied())
7019 return -CEPHFS_ENOTCONN;
7020
7021 tout(cct) << "link" << std::endl;
7022 tout(cct) << relexisting << std::endl;
7023 tout(cct) << relpath << std::endl;
7024
7025 filepath existing(relexisting);
7026
7027 InodeRef in, dir;
7028
7029 std::scoped_lock lock(client_lock);
7030 int r = path_walk(existing, &in, perm, true);
7031 if (r < 0)
7032 return r;
7033 if (std::string(relpath) == "/") {
7034 r = -CEPHFS_EEXIST;
7035 return r;
7036 }
7037 filepath path(relpath);
7038 string name = path.last_dentry();
7039 path.pop_dentry();
7040
7041 r = path_walk(path, &dir, perm, true);
7042 if (r < 0)
7043 return r;
7044 if (cct->_conf->client_permissions) {
7045 if (S_ISDIR(in->mode)) {
7046 r = -CEPHFS_EPERM;
7047 return r;
7048 }
7049 r = may_hardlink(in.get(), perm);
7050 if (r < 0)
7051 return r;
7052 r = may_create(dir.get(), perm);
7053 if (r < 0)
7054 return r;
7055 }
7056 r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
7057 return r;
7058 }
7059
7060 int Client::unlink(const char *relpath, const UserPerm& perm)
7061 {
7062 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7063 if (!mref_reader.is_state_satisfied())
7064 return -CEPHFS_ENOTCONN;
7065
7066 tout(cct) << __func__ << std::endl;
7067 tout(cct) << relpath << std::endl;
7068
7069 if (std::string(relpath) == "/")
7070 return -CEPHFS_EISDIR;
7071
7072 filepath path(relpath);
7073 string name = path.last_dentry();
7074 path.pop_dentry();
7075 InodeRef dir;
7076
7077 std::scoped_lock lock(client_lock);
7078 int r = path_walk(path, &dir, perm);
7079 if (r < 0)
7080 return r;
7081 if (cct->_conf->client_permissions) {
7082 r = may_delete(dir.get(), name.c_str(), perm);
7083 if (r < 0)
7084 return r;
7085 }
7086 return _unlink(dir.get(), name.c_str(), perm);
7087 }
7088
7089 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7090 {
7091 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7092 if (!mref_reader.is_state_satisfied())
7093 return -CEPHFS_ENOTCONN;
7094
7095 tout(cct) << __func__ << std::endl;
7096 tout(cct) << relfrom << std::endl;
7097 tout(cct) << relto << std::endl;
7098
7099 if (std::string(relfrom) == "/" || std::string(relto) == "/")
7100 return -CEPHFS_EBUSY;
7101
7102 filepath from(relfrom);
7103 filepath to(relto);
7104 string fromname = from.last_dentry();
7105 from.pop_dentry();
7106 string toname = to.last_dentry();
7107 to.pop_dentry();
7108
7109 InodeRef fromdir, todir;
7110
7111 std::scoped_lock lock(client_lock);
7112 int r = path_walk(from, &fromdir, perm);
7113 if (r < 0)
7114 goto out;
7115 r = path_walk(to, &todir, perm);
7116 if (r < 0)
7117 goto out;
7118
7119 if (cct->_conf->client_permissions) {
7120 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7121 if (r < 0)
7122 return r;
7123 r = may_delete(todir.get(), toname.c_str(), perm);
7124 if (r < 0 && r != -CEPHFS_ENOENT)
7125 return r;
7126 }
7127 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7128 out:
7129 return r;
7130 }
7131
7132 // dirs
7133
7134 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
7135 {
7136 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7137 if (!mref_reader.is_state_satisfied())
7138 return -CEPHFS_ENOTCONN;
7139
7140 tout(cct) << __func__ << std::endl;
7141 tout(cct) << relpath << std::endl;
7142 tout(cct) << mode << std::endl;
7143 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7144
7145 if (std::string(relpath) == "/")
7146 return -CEPHFS_EEXIST;
7147
7148 filepath path(relpath);
7149 string name = path.last_dentry();
7150 path.pop_dentry();
7151 InodeRef dir;
7152
7153 std::scoped_lock lock(client_lock);
7154 int r = path_walk(path, &dir, perm);
7155 if (r < 0)
7156 return r;
7157 if (cct->_conf->client_permissions) {
7158 r = may_create(dir.get(), perm);
7159 if (r < 0)
7160 return r;
7161 }
7162 return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
7163 }
7164
7165 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
7166 {
7167 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7168 if (!mref_reader.is_state_satisfied())
7169 return -CEPHFS_ENOTCONN;
7170
7171 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
7172 tout(cct) << __func__ << std::endl;
7173 tout(cct) << relpath << std::endl;
7174 tout(cct) << mode << std::endl;
7175
7176 //get through existing parts of path
7177 filepath path(relpath);
7178 unsigned int i;
7179 int r = 0, caps = 0;
7180 InodeRef cur, next;
7181
7182 std::scoped_lock lock(client_lock);
7183 cur = cwd;
7184 for (i=0; i<path.depth(); ++i) {
7185 if (cct->_conf->client_permissions) {
7186 r = may_lookup(cur.get(), perms);
7187 if (r < 0)
7188 break;
7189 caps = CEPH_CAP_AUTH_SHARED;
7190 }
7191 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
7192 if (r < 0)
7193 break;
7194 cur.swap(next);
7195 }
7196 if (r != -CEPHFS_ENOENT) return r;
7197 ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
7198 //make new directory at each level
7199 for (; i<path.depth(); ++i) {
7200 if (cct->_conf->client_permissions) {
7201 r = may_create(cur.get(), perms);
7202 if (r < 0)
7203 return r;
7204 }
7205 //make new dir
7206 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
7207
7208 //check proper creation/existence
7209 if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
7210 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
7211 }
7212 if (r < 0)
7213 return r;
7214 //move to new dir and continue
7215 cur.swap(next);
7216 ldout(cct, 20) << __func__ << ": successfully created directory "
7217 << filepath(cur->ino).get_path() << dendl;
7218 }
7219 return 0;
7220 }
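// mkdir -p style usage sketch (hypothetical caller):
//
//   int r = client->mkdirs("a/b/c", 0755, perms);
//   // existing prefix components are skipped; each missing component is
//   // created in turn, and a racing creator is tolerated via the
//   // -CEPHFS_EEXIST lookup fallback above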
7221
7222 int Client::rmdir(const char *relpath, const UserPerm& perms)
7223 {
7224 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7225 if (!mref_reader.is_state_satisfied())
7226 return -CEPHFS_ENOTCONN;
7227
7228 tout(cct) << __func__ << std::endl;
7229 tout(cct) << relpath << std::endl;
7230
7231 if (std::string(relpath) == "/")
7232 return -CEPHFS_EBUSY;
7233
7234 filepath path(relpath);
7235 string name = path.last_dentry();
7236 path.pop_dentry();
7237 InodeRef dir;
7238
7239 std::scoped_lock lock(client_lock);
7240 int r = path_walk(path, &dir, perms);
7241 if (r < 0)
7242 return r;
7243 if (cct->_conf->client_permissions) {
7244 int r = may_delete(dir.get(), name.c_str(), perms);
7245 if (r < 0)
7246 return r;
7247 }
7248 return _rmdir(dir.get(), name.c_str(), perms);
7249 }
7250
7251 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
7252 {
7253 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7254 if (!mref_reader.is_state_satisfied())
7255 return -CEPHFS_ENOTCONN;
7256
7257 tout(cct) << __func__ << std::endl;
7258 tout(cct) << relpath << std::endl;
7259 tout(cct) << mode << std::endl;
7260 tout(cct) << rdev << std::endl;
7261
7262 if (std::string(relpath) == "/")
7263 return -CEPHFS_EEXIST;
7264
7265 filepath path(relpath);
7266 string name = path.last_dentry();
7267 path.pop_dentry();
7268 InodeRef dir;
7269
7270 std::scoped_lock lock(client_lock);
7271 int r = path_walk(path, &dir, perms);
7272 if (r < 0)
7273 return r;
7274 if (cct->_conf->client_permissions) {
7275 int r = may_create(dir.get(), perms);
7276 if (r < 0)
7277 return r;
7278 }
7279 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7280 }
7281
7282 // symlinks
7283
7284 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
7285 {
7286 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7287 if (!mref_reader.is_state_satisfied())
7288 return -CEPHFS_ENOTCONN;
7289
7290 tout(cct) << __func__ << std::endl;
7291 tout(cct) << target << std::endl;
7292 tout(cct) << relpath << std::endl;
7293
7294 if (std::string(relpath) == "/")
7295 return -CEPHFS_EEXIST;
7296
7297 filepath path(relpath);
7298 string name = path.last_dentry();
7299 path.pop_dentry();
7300 InodeRef dir;
7301
7302 std::scoped_lock lock(client_lock);
7303 int r = path_walk(path, &dir, perms);
7304 if (r < 0)
7305 return r;
7306 if (cct->_conf->client_permissions) {
7307 int r = may_create(dir.get(), perms);
7308 if (r < 0)
7309 return r;
7310 }
7311 return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
7312 }
7313
7314 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7315 {
7316 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7317 if (!mref_reader.is_state_satisfied())
7318 return -CEPHFS_ENOTCONN;
7319
7320 tout(cct) << __func__ << std::endl;
7321 tout(cct) << relpath << std::endl;
7322
7323 filepath path(relpath);
7324 InodeRef in;
7325
7326 std::scoped_lock lock(client_lock);
7327 int r = path_walk(path, &in, perms, false);
7328 if (r < 0)
7329 return r;
7330
7331 return _readlink(in.get(), buf, size);
7332 }
7333
7334 int Client::_readlink(Inode *in, char *buf, size_t size)
7335 {
7336 if (!in->is_symlink())
7337 return -CEPHFS_EINVAL;
7338
7339 // copy into buf (at most size bytes)
7340 int r = in->symlink.length();
7341 if (r > (int)size)
7342 r = size;
7343 memcpy(buf, in->symlink.c_str(), r);
7344 return r;
7345 }
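// readlink(2)-style contract sketch: the target is copied without a
// trailing NUL and silently truncated to the buffer size, so a return
// value equal to `size` may indicate truncation:
//
//   char buf[64];
//   int n = client->readlink("mylink", buf, sizeof(buf), perms);
//   // n == sizeof(buf) => possibly truncated; buf is not NUL-terminated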
7346
7347
7348 // inode stuff
7349
7350 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7351 {
7352 bool yes = in->caps_issued_mask(mask, true);
7353
7354 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7355 if (yes && !force)
7356 return 0;
7357
7358 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7359 filepath path;
7360 in->make_nosnap_relative_path(path);
7361 req->set_filepath(path);
7362 req->set_inode(in);
7363 req->head.args.getattr.mask = mask;
7364
7365 int res = make_request(req, perms);
7366 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7367 return res;
7368 }
7369
7370 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
7371 const UserPerm& perms, InodeRef *inp)
7372 {
7373 int issued = in->caps_issued();
7374
7375 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7376 ccap_string(issued) << dendl;
7377
7378 if (in->snapid != CEPH_NOSNAP) {
7379 return -CEPHFS_EROFS;
7380 }
7381 if ((mask & CEPH_SETATTR_SIZE) &&
7382 (uint64_t)stx->stx_size > in->size &&
7383 is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
7384 perms)) {
7385 return -CEPHFS_EDQUOT;
7386 }
7387
7388 // make the change locally?
7389 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
7390 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
7391 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
7392 << " != cap dirtier " << in->cap_dirtier_uid << ":"
7393 << in->cap_dirtier_gid << ", forcing sync setattr"
7394 << dendl;
7395 /*
7396 * This works because we implicitly flush the caps as part of the
7397 * request, so the cap update check will happen with the writeback
7398 * cap context, and then the setattr check will happen with the
7399 * caller's context.
7400 *
7401 * In reality this pattern is likely pretty rare (different users
7402 * setattr'ing the same file). If that turns out not to be the
7403 * case later, we can build a more complex pipelined cap writeback
7404 * infrastructure...
7405 */
7406 if (!mask)
7407 mask |= CEPH_SETATTR_CTIME;
7408 goto force_request;
7409 }
7410
7411 if (!mask) {
7412 // caller just needs us to bump the ctime
7413 in->ctime = ceph_clock_now();
7414 in->cap_dirtier_uid = perms.uid();
7415 in->cap_dirtier_gid = perms.gid();
7416 if (issued & CEPH_CAP_AUTH_EXCL)
7417 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7418 else if (issued & CEPH_CAP_FILE_EXCL)
7419 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7420 else if (issued & CEPH_CAP_XATTR_EXCL)
7421 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7422 else
7423 mask |= CEPH_SETATTR_CTIME;
7424 }
7425
7426 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
7427 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
7428
7429 mask &= ~CEPH_SETATTR_KILL_SGUID;
7430
7431 if (mask & CEPH_SETATTR_UID) {
7432 in->ctime = ceph_clock_now();
7433 in->cap_dirtier_uid = perms.uid();
7434 in->cap_dirtier_gid = perms.gid();
7435 in->uid = stx->stx_uid;
7436 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7437 mask &= ~CEPH_SETATTR_UID;
7438 kill_sguid = true;
7439 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7440 }
7441 if (mask & CEPH_SETATTR_GID) {
7442 in->ctime = ceph_clock_now();
7443 in->cap_dirtier_uid = perms.uid();
7444 in->cap_dirtier_gid = perms.gid();
7445 in->gid = stx->stx_gid;
7446 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7447 mask &= ~CEPH_SETATTR_GID;
7448 kill_sguid = true;
7449 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7450 }
7451
7452 if (mask & CEPH_SETATTR_MODE) {
7453 in->ctime = ceph_clock_now();
7454 in->cap_dirtier_uid = perms.uid();
7455 in->cap_dirtier_gid = perms.gid();
7456 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
7457 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7458 mask &= ~CEPH_SETATTR_MODE;
7459 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7460 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7461 /* Must squash any setuid/setgid bits with an ownership change */
7462 in->mode &= ~(S_ISUID|S_ISGID);
7463 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7464 }
7465
7466 if (mask & CEPH_SETATTR_BTIME) {
7467 in->ctime = ceph_clock_now();
7468 in->cap_dirtier_uid = perms.uid();
7469 in->cap_dirtier_gid = perms.gid();
7470 in->btime = utime_t(stx->stx_btime);
7471 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7472 mask &= ~CEPH_SETATTR_BTIME;
7473 ldout(cct,10) << "changing btime to " << in->btime << dendl;
7474 }
7475 } else if (mask & CEPH_SETATTR_SIZE) {
7476 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7477 mask |= CEPH_SETATTR_KILL_SGUID;
7478 }
7479
7480 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
7481 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
7482 if (mask & CEPH_SETATTR_MTIME)
7483 in->mtime = utime_t(stx->stx_mtime);
7484 if (mask & CEPH_SETATTR_ATIME)
7485 in->atime = utime_t(stx->stx_atime);
7486 in->ctime = ceph_clock_now();
7487 in->cap_dirtier_uid = perms.uid();
7488 in->cap_dirtier_gid = perms.gid();
7489 in->time_warp_seq++;
7490 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7491 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
7492 }
7493 }
7494 if (!mask) {
7495 in->change_attr++;
7496 return 0;
7497 }
7498
7499 force_request:
7500 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
7501
7502 filepath path;
7503
7504 in->make_nosnap_relative_path(path);
7505 req->set_filepath(path);
7506 req->set_inode(in);
7507
7508 if (mask & CEPH_SETATTR_KILL_SGUID) {
7509 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7510 }
7511 if (mask & CEPH_SETATTR_MODE) {
7512 req->head.args.setattr.mode = stx->stx_mode;
7513 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7514 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
7515 }
7516 if (mask & CEPH_SETATTR_UID) {
7517 req->head.args.setattr.uid = stx->stx_uid;
7518 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7519 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
7520 }
7521 if (mask & CEPH_SETATTR_GID) {
7522 req->head.args.setattr.gid = stx->stx_gid;
7523 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7524 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
7525 }
7526 if (mask & CEPH_SETATTR_BTIME) {
7527 req->head.args.setattr.btime = utime_t(stx->stx_btime);
7528 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
7529 }
7530 if (mask & CEPH_SETATTR_MTIME) {
7531 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
7532 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7533 CEPH_CAP_FILE_WR;
7534 }
7535 if (mask & CEPH_SETATTR_ATIME) {
7536 req->head.args.setattr.atime = utime_t(stx->stx_atime);
7537 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
7538 CEPH_CAP_FILE_WR;
7539 }
7540 if (mask & CEPH_SETATTR_SIZE) {
7541 if ((uint64_t)stx->stx_size < mdsmap->get_max_filesize()) {
7542 req->head.args.setattr.size = stx->stx_size;
7543 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
7544 } else { //too big!
7545 put_request(req);
7546 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7547 return -CEPHFS_EFBIG;
7548 }
7549 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7550 CEPH_CAP_FILE_WR;
7551 }
7552 req->head.args.setattr.mask = mask;
7553
7554 req->regetattr_mask = mask;
7555
7556 int res = make_request(req, perms, inp);
7557 ldout(cct, 10) << "_setattr result=" << res << dendl;
7558 return res;
7559 }
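// Summary sketch of the paths chosen above (illustrative): when this client
// holds CEPH_CAP_AUTH_EXCL (and the caller matches the cap dirtier), a
// chown-like change is applied purely in memory and flushed later; without
// Ax the same change becomes a synchronous MDS SETATTR request, e.g.
//
//   struct ceph_statx stx = {};
//   stx.stx_uid = 1000;                        // hypothetical value
//   _do_setattr(in, &stx, CEPH_SETATTR_UID, perms, nullptr);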
7560
7561 /* Note that we only care about attrs that setattr cares about */
7562 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7563 {
7564 stx->stx_size = st->st_size;
7565 stx->stx_mode = st->st_mode;
7566 stx->stx_uid = st->st_uid;
7567 stx->stx_gid = st->st_gid;
7568 #ifdef __APPLE__
7569 stx->stx_mtime = st->st_mtimespec;
7570 stx->stx_atime = st->st_atimespec;
7571 #elif defined(_WIN32)
7572 stx->stx_mtime.tv_sec = st->st_mtime;
7573 stx->stx_atime.tv_sec = st->st_atime;
7574 #else
7575 stx->stx_mtime = st->st_mtim;
7576 stx->stx_atime = st->st_atim;
7577 #endif
7578 }
7579
7580 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7581 const UserPerm& perms, InodeRef *inp)
7582 {
7583 int ret = _do_setattr(in, stx, mask, perms, inp);
7584 if (ret < 0)
7585 return ret;
7586 if (mask & CEPH_SETATTR_MODE)
7587 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7588 return ret;
7589 }
7590
7591 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7592 const UserPerm& perms)
7593 {
7594 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7595 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7596 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7597 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7598 if (cct->_conf->client_permissions) {
7599 int r = may_setattr(in.get(), stx, mask, perms);
7600 if (r < 0)
7601 return r;
7602 }
7603 return __setattrx(in.get(), stx, mask, perms);
7604 }
7605
7606 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7607 const UserPerm& perms)
7608 {
7609 struct ceph_statx stx;
7610
7611 stat_to_statx(attr, &stx);
7612 mask &= ~CEPH_SETATTR_BTIME;
7613
7614 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7615 mask &= ~CEPH_SETATTR_UID;
7616 }
7617 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
7618 mask &= ~CEPH_SETATTR_GID;
7619 }
7620
7621 return _setattrx(in, &stx, mask, perms);
7622 }
7623
7624 int Client::setattr(const char *relpath, struct stat *attr, int mask,
7625 const UserPerm& perms)
7626 {
7627 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7628 if (!mref_reader.is_state_satisfied())
7629 return -CEPHFS_ENOTCONN;
7630
7631 tout(cct) << __func__ << std::endl;
7632 tout(cct) << relpath << std::endl;
7633 tout(cct) << mask << std::endl;
7634
7635 filepath path(relpath);
7636 InodeRef in;
7637
7638 std::scoped_lock lock(client_lock);
7639 int r = path_walk(path, &in, perms);
7640 if (r < 0)
7641 return r;
7642 return _setattr(in, attr, mask, perms);
7643 }
7644
7645 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7646 const UserPerm& perms, int flags)
7647 {
7648 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7649 if (!mref_reader.is_state_satisfied())
7650 return -CEPHFS_ENOTCONN;
7651
7652 tout(cct) << __func__ << std::endl;
7653 tout(cct) << relpath << std::endl;
7654 tout(cct) << mask << std::endl;
7655
7656 filepath path(relpath);
7657 InodeRef in;
7658
7659 std::scoped_lock lock(client_lock);
7660 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7661 if (r < 0)
7662 return r;
7663 return _setattrx(in, stx, mask, perms);
7664 }
7665
7666 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7667 {
7668 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7669 if (!mref_reader.is_state_satisfied())
7670 return -CEPHFS_ENOTCONN;
7671
7672 tout(cct) << __func__ << std::endl;
7673 tout(cct) << fd << std::endl;
7674 tout(cct) << mask << std::endl;
7675
7676 std::scoped_lock lock(client_lock);
7677 Fh *f = get_filehandle(fd);
7678 if (!f)
7679 return -CEPHFS_EBADF;
7680 #if defined(__linux__) && defined(O_PATH)
7681 if (f->flags & O_PATH)
7682 return -CEPHFS_EBADF;
7683 #endif
7684 return _setattr(f->inode, attr, mask, perms);
7685 }
7686
7687 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7688 {
7689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7690 if (!mref_reader.is_state_satisfied())
7691 return -CEPHFS_ENOTCONN;
7692
7693 tout(cct) << __func__ << std::endl;
7694 tout(cct) << fd << std::endl;
7695 tout(cct) << mask << std::endl;
7696
7697 std::scoped_lock lock(client_lock);
7698 Fh *f = get_filehandle(fd);
7699 if (!f)
7700 return -CEPHFS_EBADF;
7701 #if defined(__linux__) && defined(O_PATH)
7702 if (f->flags & O_PATH)
7703 return -CEPHFS_EBADF;
7704 #endif
7705 return _setattrx(f->inode, stx, mask, perms);
7706 }
7707
7708 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7709 frag_info_t *dirstat, int mask)
7710 {
7711 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7712 if (!mref_reader.is_state_satisfied())
7713 return -CEPHFS_ENOTCONN;
7714
7715 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7716 tout(cct) << "stat" << std::endl;
7717 tout(cct) << relpath << std::endl;
7718
7719 filepath path(relpath);
7720 InodeRef in;
7721
7722 std::scoped_lock lock(client_lock);
7723 int r = path_walk(path, &in, perms, true, mask);
7724 if (r < 0)
7725 return r;
7726 r = _getattr(in, mask, perms);
7727 if (r < 0) {
7728 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7729 return r;
7730 }
7731 fill_stat(in, stbuf, dirstat);
7732 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7733 return r;
7734 }
7735
7736 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7737 {
7738 unsigned mask = 0;
7739
7740 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7741 if (flags & AT_NO_ATTR_SYNC)
7742 goto out;
7743
7744 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7745 mask |= CEPH_CAP_PIN;
7746 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7747 mask |= CEPH_CAP_AUTH_SHARED;
7748 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7749 mask |= CEPH_CAP_LINK_SHARED;
7750 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7751 mask |= CEPH_CAP_FILE_SHARED;
7752 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7753 mask |= CEPH_CAP_XATTR_SHARED;
7754 out:
7755 return mask;
7756 }
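// Worked example (assuming the mappings above): a caller asking only for
// CEPH_STATX_SIZE gets CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED, whereas
// AT_NO_ATTR_SYNC short-circuits to mask == 0 so cached attributes are used
// as-is:
//
//   unsigned m1 = statx_to_mask(0, CEPH_STATX_SIZE);        // PIN | Fs
//   unsigned m2 = statx_to_mask(AT_NO_ATTR_SYNC, ~0u);      // 0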
7757
7758 int Client::statx(const char *relpath, struct ceph_statx *stx,
7759 const UserPerm& perms,
7760 unsigned int want, unsigned int flags)
7761 {
7762 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7763 if (!mref_reader.is_state_satisfied())
7764 return -CEPHFS_ENOTCONN;
7765
7766 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7767 tout(cct) << "statx" << std::endl;
7768 tout(cct) << relpath << std::endl;
7769
7770 filepath path(relpath);
7771 InodeRef in;
7772
7773 unsigned mask = statx_to_mask(flags, want);
7774
7775 std::scoped_lock lock(client_lock);
7776 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7777 if (r < 0)
7778 return r;
7779
7780 r = _getattr(in, mask, perms);
7781 if (r < 0) {
7782 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7783 return r;
7784 }
7785
7786 fill_statx(in, mask, stx);
7787 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7788 return r;
7789 }
7790
7791 int Client::lstat(const char *relpath, struct stat *stbuf,
7792 const UserPerm& perms, frag_info_t *dirstat, int mask)
7793 {
7794 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7795 if (!mref_reader.is_state_satisfied())
7796 return -CEPHFS_ENOTCONN;
7797
7798 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7799 tout(cct) << __func__ << std::endl;
7800 tout(cct) << relpath << std::endl;
7801
7802 filepath path(relpath);
7803 InodeRef in;
7804
7805 std::scoped_lock lock(client_lock);
7806 // don't follow symlinks
7807 int r = path_walk(path, &in, perms, false, mask);
7808 if (r < 0)
7809 return r;
7810 r = _getattr(in, mask, perms);
7811 if (r < 0) {
7812 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7813 return r;
7814 }
7815 fill_stat(in, stbuf, dirstat);
7816 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7817 return r;
7818 }
7819
7820 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7821 {
7822 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7823 << " mode 0" << oct << in->mode << dec
7824 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7825 memset(st, 0, sizeof(struct stat));
7826 if (use_faked_inos())
7827 st->st_ino = in->faked_ino;
7828 else
7829 st->st_ino = in->ino;
7830 st->st_dev = in->snapid;
7831 st->st_mode = in->mode;
7832 st->st_rdev = in->rdev;
7833 if (in->is_dir()) {
7834 switch (in->nlink) {
7835 case 0:
7836 st->st_nlink = 0; /* dir is unlinked */
7837 break;
7838 case 1:
7839 st->st_nlink = 1 /* parent dentry */
7840 + 1 /* <dir>/. self-reference */
7841 + in->dirstat.nsubdirs; /* one <subdir>/.. per subdirectory */
7842 break;
7843 default:
7844 ceph_abort();
7845 }
7846 } else {
7847 st->st_nlink = in->nlink;
7848 }
7849 st->st_uid = in->uid;
7850 st->st_gid = in->gid;
7851 if (in->ctime > in->mtime) {
7852 stat_set_ctime_sec(st, in->ctime.sec());
7853 stat_set_ctime_nsec(st, in->ctime.nsec());
7854 } else {
7855 stat_set_ctime_sec(st, in->mtime.sec());
7856 stat_set_ctime_nsec(st, in->mtime.nsec());
7857 }
7858 stat_set_atime_sec(st, in->atime.sec());
7859 stat_set_atime_nsec(st, in->atime.nsec());
7860 stat_set_mtime_sec(st, in->mtime.sec());
7861 stat_set_mtime_nsec(st, in->mtime.nsec());
7862 if (in->is_dir()) {
7863 if (cct->_conf->client_dirsize_rbytes)
7864 st->st_size = in->rstat.rbytes;
7865 else
7866 st->st_size = in->dirstat.size();
7867 // The Windows "stat" structure provides just a subset of the fields that are
7868 // available on Linux.
7869 #ifndef _WIN32
7870 st->st_blocks = 1;
7871 #endif
7872 } else {
7873 st->st_size = in->size;
7874 #ifndef _WIN32
7875 st->st_blocks = (in->size + 511) >> 9;
7876 #endif
7877 }
7878 #ifndef _WIN32
7879 st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7880 #endif
7881
7882 if (dirstat)
7883 *dirstat = in->dirstat;
7884 if (rstat)
7885 *rstat = in->rstat;
7886
7887 return in->caps_issued();
7888 }
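// Worked example of the st_nlink rule above (illustrative): a live directory
// with 3 subdirectories reports st_nlink = 1 (parent dentry) + 1 (its own
// "." entry) + 3 (one ".." per subdirectory) = 5, matching the classic
// POSIX "2 + nsubdirs" convention.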
7889
7890 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7891 {
7892 ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
7893 << " mode 0" << oct << in->mode << dec
7894 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7895 memset(stx, 0, sizeof(struct ceph_statx));
7896
7897 /*
7898 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7899 * so that all bits are set.
7900 */
7901 if (!mask)
7902 mask = ~0;
7903
7904 /* These are always considered to be available */
7905 stx->stx_dev = in->snapid;
7906 stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
7907
7908 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7909 stx->stx_mode = S_IFMT & in->mode;
7910 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7911 stx->stx_rdev = in->rdev;
7912 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7913
7914 if (mask & CEPH_CAP_AUTH_SHARED) {
7915 stx->stx_uid = in->uid;
7916 stx->stx_gid = in->gid;
7917 stx->stx_mode = in->mode;
7918 in->btime.to_timespec(&stx->stx_btime);
7919 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7920 }
7921
7922 if (mask & CEPH_CAP_LINK_SHARED) {
7923 if (in->is_dir()) {
7924 switch (in->nlink) {
7925 case 0:
7926 stx->stx_nlink = 0; /* dir is unlinked */
7927 break;
7928 case 1:
7929 stx->stx_nlink = 1 /* parent dentry */
7930 + 1 /* <dir>/. self-reference */
7931 + in->dirstat.nsubdirs; /* one <subdir>/.. per subdirectory */
7932 break;
7933 default:
7934 ceph_abort();
7935 }
7936 } else {
7937 stx->stx_nlink = in->nlink;
7938 }
7939 stx->stx_mask |= CEPH_STATX_NLINK;
7940 }
7941
7942 if (mask & CEPH_CAP_FILE_SHARED) {
7943
7944 in->atime.to_timespec(&stx->stx_atime);
7945 in->mtime.to_timespec(&stx->stx_mtime);
7946
7947 if (in->is_dir()) {
7948 if (cct->_conf->client_dirsize_rbytes)
7949 stx->stx_size = in->rstat.rbytes;
7950 else
7951 stx->stx_size = in->dirstat.size();
7952 stx->stx_blocks = 1;
7953 } else {
7954 stx->stx_size = in->size;
7955 stx->stx_blocks = (in->size + 511) >> 9;
7956 }
7957 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7958 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7959 }
7960
7961 /* Change time and change_attr both require all shared caps to view */
7962 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7963 stx->stx_version = in->change_attr;
7964 if (in->ctime > in->mtime)
7965 in->ctime.to_timespec(&stx->stx_ctime);
7966 else
7967 in->mtime.to_timespec(&stx->stx_ctime);
7968 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7969 }
7970
7971 }
7972
7973 void Client::touch_dn(Dentry *dn)
7974 {
7975 lru.lru_touch(dn);
7976 }
7977
7978 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7979 {
7980 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7981 if (!mref_reader.is_state_satisfied())
7982 return -CEPHFS_ENOTCONN;
7983
7984 tout(cct) << __func__ << std::endl;
7985 tout(cct) << relpath << std::endl;
7986 tout(cct) << mode << std::endl;
7987
7988 filepath path(relpath);
7989 InodeRef in;
7990
7991 std::scoped_lock lock(client_lock);
7992 int r = path_walk(path, &in, perms);
7993 if (r < 0)
7994 return r;
7995 struct stat attr;
7996 attr.st_mode = mode;
7997 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7998 }
7999
8000 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
8001 {
8002 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8003 if (!mref_reader.is_state_satisfied())
8004 return -CEPHFS_ENOTCONN;
8005
8006 tout(cct) << __func__ << std::endl;
8007 tout(cct) << fd << std::endl;
8008 tout(cct) << mode << std::endl;
8009
8010 std::scoped_lock lock(client_lock);
8011 Fh *f = get_filehandle(fd);
8012 if (!f)
8013 return -CEPHFS_EBADF;
8014 #if defined(__linux__) && defined(O_PATH)
8015 if (f->flags & O_PATH)
8016 return -CEPHFS_EBADF;
8017 #endif
8018 struct stat attr;
8019 attr.st_mode = mode;
8020 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
8021 }
8022
8023 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
8024 {
8025 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8026 if (!mref_reader.is_state_satisfied())
8027 return -CEPHFS_ENOTCONN;
8028
8029 tout(cct) << __func__ << std::endl;
8030 tout(cct) << relpath << std::endl;
8031 tout(cct) << mode << std::endl;
8032
8033 filepath path(relpath);
8034 InodeRef in;
8035
8036 std::scoped_lock lock(client_lock);
8037 // don't follow symlinks
8038 int r = path_walk(path, &in, perms, false);
8039 if (r < 0)
8040 return r;
8041 struct stat attr;
8042 attr.st_mode = mode;
8043 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
8044 }
8045
8046 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
8047 const UserPerm& perms)
8048 {
8049 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8050 if (!mref_reader.is_state_satisfied())
8051 return -CEPHFS_ENOTCONN;
8052
8053 tout(cct) << __func__ << std::endl;
8054 tout(cct) << relpath << std::endl;
8055 tout(cct) << new_uid << std::endl;
8056 tout(cct) << new_gid << std::endl;
8057
8058 filepath path(relpath);
8059 InodeRef in;
8060
8061 std::scoped_lock lock(client_lock);
8062 int r = path_walk(path, &in, perms);
8063 if (r < 0)
8064 return r;
8065 struct stat attr;
8066 attr.st_uid = new_uid;
8067 attr.st_gid = new_gid;
8068 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
8069 }
8070
8071 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
8072 {
8073 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8074 if (!mref_reader.is_state_satisfied())
8075 return -CEPHFS_ENOTCONN;
8076
8077 tout(cct) << __func__ << std::endl;
8078 tout(cct) << fd << std::endl;
8079 tout(cct) << new_uid << std::endl;
8080 tout(cct) << new_gid << std::endl;
8081
8082 std::scoped_lock lock(client_lock);
8083 Fh *f = get_filehandle(fd);
8084 if (!f)
8085 return -CEPHFS_EBADF;
8086 #if defined(__linux__) && defined(O_PATH)
8087 if (f->flags & O_PATH)
8088 return -CEPHFS_EBADF;
8089 #endif
8090 struct stat attr;
8091 attr.st_uid = new_uid;
8092 attr.st_gid = new_gid;
8093 int mask = 0;
8094 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8095 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8096 return _setattr(f->inode, &attr, mask, perms);
8097 }
8098
8099 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
8100 const UserPerm& perms)
8101 {
8102 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8103 if (!mref_reader.is_state_satisfied())
8104 return -CEPHFS_ENOTCONN;
8105
8106 tout(cct) << __func__ << std::endl;
8107 tout(cct) << relpath << std::endl;
8108 tout(cct) << new_uid << std::endl;
8109 tout(cct) << new_gid << std::endl;
8110
8111 filepath path(relpath);
8112 InodeRef in;
8113
8114 std::scoped_lock lock(client_lock);
8115 // don't follow symlinks
8116 int r = path_walk(path, &in, perms, false);
8117 if (r < 0)
8118 return r;
8119 struct stat attr;
8120 attr.st_uid = new_uid;
8121 attr.st_gid = new_gid;
8122 int mask = 0;
8123 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
8124 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
8125 return _setattr(in, &attr, mask, perms);
8126 }
8127
8128 static void attr_set_atime_and_mtime(struct stat *attr,
8129 const utime_t &atime,
8130 const utime_t &mtime)
8131 {
8132 stat_set_atime_sec(attr, atime.tv.tv_sec);
8133 stat_set_atime_nsec(attr, atime.tv.tv_nsec);
8134 stat_set_mtime_sec(attr, mtime.tv.tv_sec);
8135 stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
8136 }
8137
8138 // for [l]utime(), invoke the timeval variant, as the timespec
8139 // variants are not yet implemented. for futime[s](), invoke
8140 // the timespec variant.
8141 int Client::utime(const char *relpath, struct utimbuf *buf,
8142 const UserPerm& perms)
8143 {
8144 struct timeval tv[2];
8145 tv[0].tv_sec = buf->actime;
8146 tv[0].tv_usec = 0;
8147 tv[1].tv_sec = buf->modtime;
8148 tv[1].tv_usec = 0;
8149
8150 return utimes(relpath, tv, perms);
8151 }
8152
8153 int Client::lutime(const char *relpath, struct utimbuf *buf,
8154 const UserPerm& perms)
8155 {
8156 struct timeval tv[2];
8157 tv[0].tv_sec = buf->actime;
8158 tv[0].tv_usec = 0;
8159 tv[1].tv_sec = buf->modtime;
8160 tv[1].tv_usec = 0;
8161
8162 return lutimes(relpath, tv, perms);
8163 }
8164
8165 int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8166 {
8167 struct timespec ts[2];
8168 ts[0].tv_sec = buf->actime;
8169 ts[0].tv_nsec = 0;
8170 ts[1].tv_sec = buf->modtime;
8171 ts[1].tv_nsec = 0;
8172
8173 return futimens(fd, ts, perms);
8174 }
8175
8176 int Client::utimes(const char *relpath, struct timeval times[2],
8177 const UserPerm& perms)
8178 {
8179 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8180 if (!mref_reader.is_state_satisfied())
8181 return -CEPHFS_ENOTCONN;
8182
8183 tout(cct) << __func__ << std::endl;
8184 tout(cct) << relpath << std::endl;
8185 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8186 << std::endl;
8187 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8188 << std::endl;
8189
8190 filepath path(relpath);
8191 InodeRef in;
8192
8193 std::scoped_lock lock(client_lock);
8194 int r = path_walk(path, &in, perms);
8195 if (r < 0)
8196 return r;
8197 struct stat attr;
8198 utime_t atime(times[0]);
8199 utime_t mtime(times[1]);
8200
8201 attr_set_atime_and_mtime(&attr, atime, mtime);
8202 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8203 }
8204
8205 int Client::lutimes(const char *relpath, struct timeval times[2],
8206 const UserPerm& perms)
8207 {
8208 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8209 if (!mref_reader.is_state_satisfied())
8210 return -CEPHFS_ENOTCONN;
8211
8212 tout(cct) << __func__ << std::endl;
8213 tout(cct) << relpath << std::endl;
8214 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
8215 << std::endl;
8216 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
8217 << std::endl;
8218
8219 filepath path(relpath);
8220 InodeRef in;
8221
8222 std::scoped_lock lock(client_lock);
8223 int r = path_walk(path, &in, perms, false);
8224 if (r < 0)
8225 return r;
8226 struct stat attr;
8227 utime_t atime(times[0]);
8228 utime_t mtime(times[1]);
8229
8230 attr_set_atime_and_mtime(&attr, atime, mtime);
8231 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8232 }
8233
8234 int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8235 {
8236 struct timespec ts[2];
8237 ts[0].tv_sec = times[0].tv_sec;
8238 ts[0].tv_nsec = times[0].tv_usec * 1000;
8239 ts[1].tv_sec = times[1].tv_sec;
8240 ts[1].tv_nsec = times[1].tv_usec * 1000;
8241
8242 return futimens(fd, ts, perms);
8243 }
8244
8245 int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
8246 {
8247 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8248 if (!mref_reader.is_state_satisfied())
8249 return -CEPHFS_ENOTCONN;
8250
8251 tout(cct) << __func__ << std::endl;
8252 tout(cct) << fd << std::endl;
8253 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
8254 << std::endl;
8255 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
8256 << std::endl;
8257
8258 std::scoped_lock lock(client_lock);
8259 Fh *f = get_filehandle(fd);
8260 if (!f)
8261 return -CEPHFS_EBADF;
8262 #if defined(__linux__) && defined(O_PATH)
8263 if (f->flags & O_PATH)
8264 return -CEPHFS_EBADF;
8265 #endif
8266 struct stat attr;
8267 utime_t atime(times[0]);
8268 utime_t mtime(times[1]);
8269
8270 attr_set_atime_and_mtime(&attr, atime, mtime);
8271 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
8272 }
8273
8274 int Client::flock(int fd, int operation, uint64_t owner)
8275 {
8276 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8277 if (!mref_reader.is_state_satisfied())
8278 return -CEPHFS_ENOTCONN;
8279
8280 tout(cct) << __func__ << std::endl;
8281 tout(cct) << fd << std::endl;
8282 tout(cct) << operation << std::endl;
8283 tout(cct) << owner << std::endl;
8284
8285 std::scoped_lock lock(client_lock);
8286 Fh *f = get_filehandle(fd);
8287 if (!f)
8288 return -CEPHFS_EBADF;
8289
8290 return _flock(f, operation, owner);
8291 }
8292
8293 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8294 {
8295 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8296 if (!mref_reader.is_state_satisfied())
8297 return -CEPHFS_ENOTCONN;
8298
8299 tout(cct) << __func__ << std::endl;
8300 tout(cct) << relpath << std::endl;
8301
8302 filepath path(relpath);
8303 InodeRef in;
8304
8305 std::scoped_lock lock(client_lock);
8306 int r = path_walk(path, &in, perms, true);
8307 if (r < 0)
8308 return r;
8309 if (cct->_conf->client_permissions) {
8310 int r = may_open(in.get(), O_RDONLY, perms);
8311 if (r < 0)
8312 return r;
8313 }
8314 r = _opendir(in.get(), dirpp, perms);
8315 /* on ENOTDIR, *dirpp is left uninitialized, so it must not be dereferenced */
8316 if (r != -CEPHFS_ENOTDIR)
8317 tout(cct) << (uintptr_t)*dirpp << std::endl;
8318 return r;
8319 }
8320
8321 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8322 {
8323 if (!in->is_dir())
8324 return -CEPHFS_ENOTDIR;
8325 *dirpp = new dir_result_t(in, perms);
8326 opened_dirs.insert(*dirpp);
8327 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
8328 return 0;
8329 }
8330
8331
8332 int Client::closedir(dir_result_t *dir)
8333 {
8334 tout(cct) << __func__ << std::endl;
8335 tout(cct) << (uintptr_t)dir << std::endl;
8336
8337 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
8338 std::scoped_lock lock(client_lock);
8339 _closedir(dir);
8340 return 0;
8341 }
8342
8343 void Client::_closedir(dir_result_t *dirp)
8344 {
8345 ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
8346
8347 if (dirp->inode) {
8348 ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
8349 dirp->inode.reset();
8350 }
8351 _readdir_drop_dirp_buffer(dirp);
8352 opened_dirs.erase(dirp);
8353 delete dirp;
8354 }
8355
8356 void Client::rewinddir(dir_result_t *dirp)
8357 {
8358 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
8359
8360 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8361 if (!mref_reader.is_state_satisfied())
8362 return;
8363
8364 std::scoped_lock lock(client_lock);
8365 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8366 _readdir_drop_dirp_buffer(d);
8367 d->reset();
8368 }
8369
8370 loff_t Client::telldir(dir_result_t *dirp)
8371 {
8372 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8373 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
8374 return d->offset;
8375 }
8376
8377 void Client::seekdir(dir_result_t *dirp, loff_t offset)
8378 {
8379 ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;
8380
8381 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8382 if (!mref_reader.is_state_satisfied())
8383 return;
8384
8385 std::scoped_lock lock(client_lock);
8386
8387 if (offset == dirp->offset)
8388 return;
8389
8390 if (offset > dirp->offset)
8391 dirp->release_count = 0; // a forward seek invalidates cache-completeness tracking
8392 else
8393 dirp->ordered_count = 0; // disable filling readdir cache
8394
8395 if (dirp->hash_order()) {
8396 if (dirp->offset > offset) {
8397 _readdir_drop_dirp_buffer(dirp);
8398 dirp->reset();
8399 }
8400 } else {
8401 if (offset == 0 ||
8402 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
8403 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
8404 _readdir_drop_dirp_buffer(dirp);
8405 dirp->reset();
8406 }
8407 }
8408
8409 dirp->offset = offset;
8410 }
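// Illustrative sketch of the offset encoding consumed above: telldir()
// positions pack (fragment, within-frag index) via dir_result_t::make_fpos,
// so a round-trip looks like
//
//   loff_t pos = telldir(dirp);   // e.g. frag 0, index 2 = first real entry
//   ...                           // read some more entries
//   seekdir(dirp, pos);           // may drop the buffer per the rules above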
8411
8412
8413 //struct dirent {
8414 // ino_t d_ino; /* inode number */
8415 // off_t d_off; /* offset to the next dirent */
8416 // unsigned short d_reclen; /* length of this record */
8417 // unsigned char d_type; /* type of file */
8418 // char d_name[256]; /* filename */
8419 //};
8420 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
8421 {
8422 strncpy(de->d_name, name, 255);
8423 de->d_name[255] = '\0';
8424 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8425 de->d_ino = ino;
8426 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8427 de->d_off = next_off;
8428 #endif
8429 de->d_reclen = 1;
8430 de->d_type = IFTODT(type);
8431 ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
8432 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
8433 #endif
8434 }
8435
8436 void Client::_readdir_next_frag(dir_result_t *dirp)
8437 {
8438 frag_t fg = dirp->buffer_frag;
8439
8440 if (fg.is_rightmost()) {
8441 ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
8442 dirp->set_end();
8443 return;
8444 }
8445
8446 // advance
8447 fg = fg.next();
8448 ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;
8449
8450 if (dirp->hash_order()) {
8451 // keep last_name
8452 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
8453 if (dirp->offset < new_offset) // don't decrease offset
8454 dirp->offset = new_offset;
8455 } else {
8456 dirp->last_name.clear();
8457 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8458 _readdir_rechoose_frag(dirp);
8459 }
8460 }
8461
8462 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
8463 {
8464 ceph_assert(dirp->inode);
8465
8466 if (dirp->hash_order())
8467 return;
8468
8469 frag_t cur = frag_t(dirp->offset_high());
8470 frag_t fg = dirp->inode->dirfragtree[cur.value()];
8471 if (fg != cur) {
8472 ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
8473 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
8474 dirp->last_name.clear();
8475 dirp->next_offset = 2;
8476 }
8477 }
8478
8479 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
8480 {
8481 ldout(cct, 10) << __func__ << " " << dirp << dendl;
8482 dirp->buffer.clear();
8483 }
8484
8485 int Client::_readdir_get_frag(dir_result_t *dirp)
8486 {
8487 ceph_assert(dirp);
8488 ceph_assert(dirp->inode);
8489
8490 // get the current frag.
8491 frag_t fg;
8492 if (dirp->hash_order())
8493 fg = dirp->inode->dirfragtree[dirp->offset_high()];
8494 else
8495 fg = frag_t(dirp->offset_high());
8496
8497 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
8498 << " offset " << hex << dirp->offset << dec << dendl;
8499
8500 int op = CEPH_MDS_OP_READDIR;
8501 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
8502 op = CEPH_MDS_OP_LSSNAP;
8503
8504 InodeRef& diri = dirp->inode;
8505
8506 MetaRequest *req = new MetaRequest(op);
8507 filepath path;
8508 diri->make_nosnap_relative_path(path);
8509 req->set_filepath(path);
8510 req->set_inode(diri.get());
8511 req->head.args.readdir.frag = fg;
8512 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
8513 if (dirp->last_name.length()) {
8514 req->path2.set_path(dirp->last_name);
8515 } else if (dirp->hash_order()) {
8516 req->head.args.readdir.offset_hash = dirp->offset_high();
8517 }
8518 req->dirp = dirp;
8519
8520 bufferlist dirbl;
8521 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
8522
8523 if (res == -CEPHFS_EAGAIN) {
8524 ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
8525 _readdir_rechoose_frag(dirp);
8526 return _readdir_get_frag(dirp);
8527 }
8528
8529 if (res == 0) {
8530 ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
8531 << " size " << dirp->buffer.size() << dendl;
8532 } else {
8533 ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
8534 dirp->set_end();
8535 }
8536
8537 return res;
8538 }
8539
8540 struct dentry_off_lt {
8541 bool operator()(const Dentry* dn, int64_t off) const {
8542 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
8543 }
8544 };
8545
8546 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
8547 int caps, bool getref)
8548 {
8549 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
8550 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
8551 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
8552 << dendl;
8553 Dir *dir = dirp->inode->dir;
8554
8555 if (!dir) {
8556 ldout(cct, 10) << " dir is empty" << dendl;
8557 dirp->set_end();
8558 return 0;
8559 }
8560
8561 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
8562 dir->readdir_cache.end(),
8563 dirp->offset, dentry_off_lt());
8564
8565 string dn_name;
8566 while (true) {
8567 int mask = caps;
8568 if (!dirp->inode->is_complete_and_ordered())
8569 return -CEPHFS_EAGAIN;
8570 if (pd == dir->readdir_cache.end())
8571 break;
8572 Dentry *dn = *pd;
8573 if (dn->inode == NULL) {
8574 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
8575 ++pd;
8576 continue;
8577 }
8578 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
8579 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
8580 ++pd;
8581 continue;
8582 }
8583
8584 int idx = pd - dir->readdir_cache.begin();
8585 if (dn->inode->is_dir()) {
8586 mask |= CEPH_STAT_RSTAT;
8587 }
8588 int r = _getattr(dn->inode, mask, dirp->perms);
8589 if (r < 0)
8590 return r;
8591
8592 // the contents of readdir_cache may change after _getattr(), so pd may be an invalid iterator
8593 pd = dir->readdir_cache.begin() + idx;
8594 if (pd >= dir->readdir_cache.end() || *pd != dn)
8595 return -CEPHFS_EAGAIN;
8596
8597 struct ceph_statx stx;
8598 struct dirent de;
8599 fill_statx(dn->inode, caps, &stx);
8600
8601 uint64_t next_off = dn->offset + 1;
8602 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8603 ++pd;
8604 if (pd == dir->readdir_cache.end())
8605 next_off = dir_result_t::END;
8606
8607 Inode *in = NULL;
8608 if (getref) {
8609 in = dn->inode.get();
8610 _ll_get(in);
8611 }
8612
8613 dn_name = dn->name; // fill in name while we have lock
8614
8615 client_lock.unlock();
8616 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8617 client_lock.lock();
8618 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8619 << " = " << r << dendl;
8620 if (r < 0) {
8621 return r;
8622 }
8623
8624 dirp->offset = next_off;
8625 if (dirp->at_end())
8626 dirp->next_offset = 2;
8627 else
8628 dirp->next_offset = dirp->offset_low();
8629 dirp->last_name = dn_name; // we successfully returned this one; update!
8630 dirp->release_count = 0; // last_name no longer matches the cache index
8631 if (r > 0)
8632 return r;
8633 }
8634
8635 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
8636 dirp->set_end();
8637 return 0;
8638 }
8639
8640 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8641 unsigned want, unsigned flags, bool getref)
8642 {
8643 int caps = statx_to_mask(flags, want);
8644
8645 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8646 if (!mref_reader.is_state_satisfied())
8647 return -CEPHFS_ENOTCONN;
8648
8649 std::unique_lock cl(client_lock);
8650
8651 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8652
8653 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
8654 << dec << " at_end=" << dirp->at_end()
8655 << " hash_order=" << dirp->hash_order() << dendl;
8656
8657 struct dirent de;
8658 struct ceph_statx stx;
8659 memset(&de, 0, sizeof(de));
8660 memset(&stx, 0, sizeof(stx));
8661
8662 InodeRef& diri = dirp->inode;
8663
8664 if (dirp->at_end())
8665 return 0;
8666
8667 if (dirp->offset == 0) {
8668 ldout(cct, 15) << " including ." << dendl;
8669 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
8670 uint64_t next_off = 1;
8671
8672 int r;
8673 r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
8674 if (r < 0)
8675 return r;
8676
8677 fill_statx(diri, caps, &stx);
8678 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8679
8680 Inode *inode = NULL;
8681 if (getref) {
8682 inode = diri.get();
8683 _ll_get(inode);
8684 }
8685
8686 cl.unlock();
8687 r = cb(p, &de, &stx, next_off, inode);
8688 cl.lock();
8689 if (r < 0)
8690 return r;
8691
8692 dirp->offset = next_off;
8693 if (r > 0)
8694 return r;
8695 }
8696 if (dirp->offset == 1) {
8697 ldout(cct, 15) << " including .." << dendl;
8698 uint64_t next_off = 2;
8699 InodeRef in;
8700 if (diri->dentries.empty())
8701 in = diri;
8702 else
8703 in = diri->get_first_parent()->dir->parent_inode;
8704
8705 int r;
8706 r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
8707 if (r < 0)
8708 return r;
8709
8710 fill_statx(in, caps, &stx);
8711 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8712
8713 Inode *inode = NULL;
8714 if (getref) {
8715 inode = in.get();
8716 _ll_get(inode);
8717 }
8718
8719 cl.unlock();
8720 r = cb(p, &de, &stx, next_off, inode);
8721 cl.lock();
8722 if (r < 0)
8723 return r;
8724
8725 dirp->offset = next_off;
8726 if (r > 0)
8727 return r;
8728 }
8729
8730 // can we read from our cache?
8731 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8732 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8733 << dirp->inode->is_complete_and_ordered()
8734 << " issued " << ccap_string(dirp->inode->caps_issued())
8735 << dendl;
8736 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8737 dirp->inode->is_complete_and_ordered() &&
8738 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
8739 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8740 if (err != -CEPHFS_EAGAIN)
8741 return err;
8742 }
8743
8744 while (1) {
8745 if (dirp->at_end())
8746 return 0;
8747
8748 bool check_caps = true;
8749 if (!dirp->is_cached()) {
8750 int r = _readdir_get_frag(dirp);
8751 if (r)
8752 return r;
8753 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8754 // different from the requested one (i.e. our dirfragtree was outdated).
8755 check_caps = false;
8756 }
8757 frag_t fg = dirp->buffer_frag;
8758
8759 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8760 << " offset " << hex << dirp->offset << dendl;
8761
8762 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8763 dirp->offset, dir_result_t::dentry_off_lt());
8764 it != dirp->buffer.end();
8765 ++it) {
8766 dir_result_t::dentry &entry = *it;
8767
8768 uint64_t next_off = entry.offset + 1;
8769
8770 int r;
8771 if (check_caps) {
8772 int mask = caps;
8773 if (entry.inode->is_dir()) {
8774 mask |= CEPH_STAT_RSTAT;
8775 }
8776 r = _getattr(entry.inode, mask, dirp->perms);
8777 if (r < 0)
8778 return r;
8779 }
8780
8781 fill_statx(entry.inode, caps, &stx);
8782 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8783
8784 Inode *inode = NULL;
8785 if (getref) {
8786 inode = entry.inode.get();
8787 _ll_get(inode);
8788 }
8789
8790 cl.unlock();
8791 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8792 cl.lock();
8793
8794 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8795 << " = " << r << dendl;
8796 if (r < 0)
8797 return r;
8798
8799 dirp->offset = next_off;
8800 if (r > 0)
8801 return r;
8802 }
8803
8804 if (dirp->next_offset > 2) {
8805 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8806 _readdir_drop_dirp_buffer(dirp);
8807 continue; // more!
8808 }
8809
8810 if (!fg.is_rightmost()) {
8811 // next frag!
8812 _readdir_next_frag(dirp);
8813 continue;
8814 }
8815
8816 if (diri->shared_gen == dirp->start_shared_gen &&
8817 diri->dir_release_count == dirp->release_count) {
8818 if (diri->dir_ordered_count == dirp->ordered_count) {
8819 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8820 if (diri->dir) {
8821 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
8822 diri->dir->readdir_cache.resize(dirp->cache_index);
8823 }
8824 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8825 } else {
8826 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8827 diri->flags |= I_COMPLETE;
8828 }
8829 }
8830
8831 dirp->set_end();
8832 return 0;
8833 }
8834 ceph_abort();
8835 return 0;
8836 }
8837
8838
8839 int Client::readdir_r(dir_result_t *d, struct dirent *de)
8840 {
8841 return readdirplus_r(d, de, 0, 0, 0, NULL);
8842 }
8843
8844 /*
8845 * readdirplus_r
8846 *
8847 * returns
8848 * 1 if we got a dirent
8849 * 0 for end of directory
8850 * <0 on error
8851 */
8852
8853 struct single_readdir {
8854 struct dirent *de;
8855 struct ceph_statx *stx;
8856 Inode *inode;
8857 bool full;
8858 };
8859
8860 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8861 struct ceph_statx *stx, off_t off,
8862 Inode *in)
8863 {
8864 single_readdir *c = static_cast<single_readdir *>(p);
8865
8866 if (c->full)
8867 return -1; // already filled this dirent
8868
8869 *c->de = *de;
8870 if (c->stx)
8871 *c->stx = *stx;
8872 c->inode = in;
8873 c->full = true;
8874 return 1;
8875 }
8876
8877 struct dirent *Client::readdir(dir_result_t *d)
8878 {
8879 int ret;
8880 auto& de = d->de;
8881 single_readdir sr;
8882 sr.de = &de;
8883 sr.stx = NULL;
8884 sr.inode = NULL;
8885 sr.full = false;
8886
8887 // our callback fills the dirent and sets sr.full=true on first
8888 // call, and returns -1 the second time around.
8889 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8890 if (ret < -1) {
8891 errno = -ret; // this sucks.
8892 return (dirent *) NULL;
8893 }
8894 if (sr.full) {
8895 return &de;
8896 }
8897 return (dirent *) NULL;
8898 }
8899
8900 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8901 struct ceph_statx *stx, unsigned want,
8902 unsigned flags, Inode **out)
8903 {
8904 single_readdir sr;
8905 sr.de = de;
8906 sr.stx = stx;
8907 sr.inode = NULL;
8908 sr.full = false;
8909
8910 // our callback fills the dirent and sets sr.full=true on first
8911 // call, and returns -1 the second time around.
8912 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8913 if (r < -1)
8914 return r;
8915 if (out)
8916 *out = sr.inode;
8917 if (sr.full)
8918 return 1;
8919 return 0;
8920 }
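// Illustrative caller loop (sketch): readdirplus_r() returns 1 per entry and
// 0 at end-of-directory, so typical consumption looks like
//
//   struct dirent de;
//   struct ceph_statx stx;
//   while (client->readdirplus_r(d, &de, &stx, CEPH_STATX_SIZE, 0, nullptr) == 1)
//     process(de.d_name, stx.stx_size);  // process() is a hypothetical helper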
8921
8922
8923 /* getdents */
8924 struct getdents_result {
8925 char *buf;
8926 int buflen;
8927 int pos;
8928 bool fullent;
8929 };
8930
8931 static int _readdir_getdent_cb(void *p, struct dirent *de,
8932 struct ceph_statx *stx, off_t off, Inode *in)
8933 {
8934 struct getdents_result *c = static_cast<getdents_result *>(p);
8935
8936 int dlen;
8937 if (c->fullent)
8938 dlen = sizeof(*de);
8939 else
8940 dlen = strlen(de->d_name) + 1;
8941
8942 if (c->pos + dlen > c->buflen)
8943 return -1; // doesn't fit
8944
8945 if (c->fullent) {
8946 memcpy(c->buf + c->pos, de, sizeof(*de));
8947 } else {
8948 memcpy(c->buf + c->pos, de->d_name, dlen);
8949 }
8950 c->pos += dlen;
8951 return 0;
8952 }
8953
8954 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8955 {
8956 getdents_result gr;
8957 gr.buf = buf;
8958 gr.buflen = buflen;
8959 gr.fullent = fullent;
8960 gr.pos = 0;
8961
8962 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8963
8964 if (r < 0) { // some error
8965 if (r == -1) { // buffer ran out of space
8966 if (gr.pos) { // but we got some entries already!
8967 return gr.pos;
8968 } // or we need a larger buffer
8969 return -CEPHFS_ERANGE;
8970 } else { // actual error, return it
8971 return r;
8972 }
8973 }
8974 return gr.pos;
8975 }
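// Illustrative buffer-sizing note (sketch): a too-small buffer yields
// -CEPHFS_ERANGE only when not even one entry fit; otherwise the bytes
// already packed are returned, so callers typically loop:
//
//   char buf[4096];
//   int n;
//   while ((n = client->getdents(dirp, buf, sizeof(buf))) > 0)
//     consume(buf, n);   // consume() is a hypothetical caller helper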
8976
8977
8978 /* getdir */
8979 struct getdir_result {
8980 list<string> *contents;
8981 int num;
8982 };
8983
8984 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8985 {
8986 getdir_result *r = static_cast<getdir_result *>(p);
8987
8988 r->contents->push_back(de->d_name);
8989 r->num++;
8990 return 0;
8991 }
8992
8993 int Client::getdir(const char *relpath, list<string>& contents,
8994 const UserPerm& perms)
8995 {
8996 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8997 tout(cct) << "getdir" << std::endl;
8998 tout(cct) << relpath << std::endl;
8999
9000 dir_result_t *d;
9001 int r = opendir(relpath, &d, perms);
9002 if (r < 0)
9003 return r;
9004
9005 getdir_result gr;
9006 gr.contents = &contents;
9007 gr.num = 0;
9008 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9009
9010 closedir(d);
9011
9012 if (r < 0)
9013 return r;
9014 return gr.num;
9015 }
9016
9017
9018 /****** file i/o **********/
9019 int Client::open(const char *relpath, int flags, const UserPerm& perms,
9020 mode_t mode, int stripe_unit, int stripe_count,
9021 int object_size, const char *data_pool, std::string alternate_name)
9022 {
9023 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9024 if (!mref_reader.is_state_satisfied())
9025 return -CEPHFS_ENOTCONN;
9026
9027 int cflags = ceph_flags_sys2wire(flags);
9028
9029 ldout(cct, 3) << "open enter(" << relpath << ", " << cflags << "," << mode << ")" << dendl;
9030 tout(cct) << "open" << std::endl;
9031 tout(cct) << relpath << std::endl;
9032 tout(cct) << cflags << std::endl;
9033
9034 Fh *fh = NULL;
9035
9036 #if defined(__linux__) && defined(O_PATH)
9037 /* When O_PATH is specified, flags other than O_DIRECTORY
9038 * and O_NOFOLLOW are ignored. Refer to the do_entry_open() function
9039 * in the kernel (fs/open.c). */
9040 if (flags & O_PATH)
9041 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9042 #endif
9043
9044 filepath path(relpath);
9045 InodeRef in;
9046 bool created = false;
9047 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
9048 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
9049 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9050
9051 std::scoped_lock lock(client_lock);
9052 int r = path_walk(path, &in, perms, followsym, mask);
9053
9054 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
9055 return -CEPHFS_EEXIST;
9056
9057 #if defined(__linux__) && defined(O_PATH)
9058 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9059 #else
9060 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
9061 #endif
9062 return -CEPHFS_ELOOP;
9063
9064 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
9065 filepath dirpath = path;
9066 string dname = dirpath.last_dentry();
9067 dirpath.pop_dentry();
9068 InodeRef dir;
9069 r = path_walk(dirpath, &dir, perms, true,
9070 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
9071 if (r < 0)
9072 goto out;
9073 if (cct->_conf->client_permissions) {
9074 r = may_create(dir.get(), perms);
9075 if (r < 0)
9076 goto out;
9077 }
9078 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
9079 stripe_count, object_size, data_pool, &created, perms,
9080 std::move(alternate_name));
9081 }
9082 if (r < 0)
9083 goto out;
9084
9085 if (!created) {
9086 // posix says we can only check permissions of existing files
9087 if (cct->_conf->client_permissions) {
9088 r = may_open(in.get(), flags, perms);
9089 if (r < 0)
9090 goto out;
9091 }
9092 }
9093
9094 if (!fh)
9095 r = _open(in.get(), flags, mode, &fh, perms);
9096 if (r >= 0) {
9097 // allocate an integer file descriptor
9098 ceph_assert(fh);
9099 r = get_fd();
9100 ceph_assert(fd_map.count(r) == 0);
9101 fd_map[r] = fh;
9102 }
9103
9104 out:
9105 tout(cct) << r << std::endl;
9106 ldout(cct, 3) << "open exit(" << path << ", " << cflags << ") = " << r << dendl;
9107 return r;
9108 }
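// Illustrative usage (sketch, assuming a data pool named "cephfs_data"
// exists and the layout parameters are acceptable to the MDS):
//
//   int fd = client->open("/a/b", O_CREAT | O_WRONLY, perms, 0644,
//                         1 << 22 /* stripe_unit */, 1 /* stripe_count */,
//                         1 << 22 /* object_size */, "cephfs_data");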
9109
9110 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9111 const UserPerm& perms)
9112 {
9113 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
9114
9115 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9116 if (!mref_reader.is_state_satisfied())
9117 return -CEPHFS_ENOTCONN;
9118
9119 std::scoped_lock lock(client_lock);
9120 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9121 filepath path(ino);
9122 req->set_filepath(path);
9123
9124 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9125 char f[30];
9126 sprintf(f, "%u", h);
9127 filepath path2(dirino);
9128 path2.push_dentry(string(f));
9129 req->set_filepath2(path2);
9130
9131 int r = make_request(req, perms, NULL, NULL,
9132 rand() % mdsmap->get_num_in_mds());
9133 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
9134 return r;
9135 }
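// Illustrative example of the path2 built above: for a dentry named "foo"
// under directory #0x100, path2 is "#0x100" plus one component holding the
// decimal RJENKINS hash of "foo", which lets the MDS locate a dentry when
// only its hash is known.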
9136
9137
9138 /**
9139 * Load inode into local cache.
9140 *
9141 * If the inode pointer is non-NULL, also take a reference on
9142 * the resulting Inode object in the same operation, so that the
9143 * caller can safely assume the inode will still be there after return.
9144 */
9145 int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
9146 {
9147 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
9148
9149 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9150 if (!mref_reader.is_state_satisfied())
9151 return -CEPHFS_ENOTCONN;
9152
9153 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
9154 filepath path(vino.ino);
9155 req->set_filepath(path);
9156
9157 /*
9158 * The MDS expects either a "real" snapid here or 0. The special value
9159 * carveouts for the snapid are all at the end of the range so we can
9160 * just look for any snapid below this value.
9161 */
9162 if (vino.snapid < CEPH_NOSNAP)
9163 req->head.args.lookupino.snapid = vino.snapid;
9164
9165 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9166 if (r == 0 && inode != NULL) {
9167 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
9168 ceph_assert(p != inode_map.end());
9169 *inode = p->second;
9170 _ll_get(*inode);
9171 }
9172 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
9173 return r;
9174 }
9175
9176 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9177 {
9178 vinodeno_t vino(ino, CEPH_NOSNAP);
9179 std::scoped_lock lock(client_lock);
9180 return _lookup_vino(vino, perms, inode);
9181 }
9182
9183 /**
9184 * Find the parent inode of `ino` and insert it into
9185 * our cache. Conditionally also set `parent` to a referenced
9186 * Inode* if caller provides non-NULL value.
9187 */
9188 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
9189 {
9190 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
9191
9192 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9193 filepath path(ino->ino);
9194 req->set_filepath(path);
9195
9196 InodeRef target;
9197 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9198 // Give caller a reference to the parent ino if they provided a pointer.
9199 if (parent != NULL) {
9200 if (r == 0) {
9201 *parent = target.get();
9202 _ll_get(*parent);
9203 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
9204 } else {
9205 *parent = NULL;
9206 }
9207 }
9208 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9209 return r;
9210 }
9211
9212 /**
9213 * Populate the parent dentry for `ino`, provided it is
9214 * a child of `parent`.
9215 */
9216 int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9217 {
9218 ceph_assert(parent->is_dir());
9219 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
9220
9221 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9222 if (!mref_reader.is_state_satisfied())
9223 return -CEPHFS_ENOTCONN;
9224
9225 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9226 req->set_filepath2(filepath(parent->ino));
9227 req->set_filepath(filepath(ino->ino));
9228 req->set_inode(ino);
9229
9230 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9231 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
9232 return r;
9233 }
9234
9235 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9236 {
9237 std::scoped_lock lock(client_lock);
9238 return _lookup_name(ino, parent, perms);
9239 }
9240
9241 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
9242 {
9243 ceph_assert(in);
9244 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
9245
9246 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
9247
9248 if (in->snapid != CEPH_NOSNAP) {
9249 in->snap_cap_refs++;
9250 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9251 << ccap_string(in->caps_issued()) << dendl;
9252 }
9253
9254 const auto& conf = cct->_conf;
9255 f->readahead.set_trigger_requests(1);
9256 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9257 uint64_t max_readahead = Readahead::NO_LIMIT;
9258 if (conf->client_readahead_max_bytes) {
9259 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
9260 }
9261 if (conf->client_readahead_max_periods) {
9262 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
9263 }
9264 f->readahead.set_max_readahead_size(max_readahead);
9265 vector<uint64_t> alignments;
9266 alignments.push_back(in->layout.get_period());
9267 alignments.push_back(in->layout.stripe_unit);
9268 f->readahead.set_alignments(alignments);
9269
9270 return f;
9271 }
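// Worked example for the readahead sizing above (a sketch, assuming the
// default layout of 4 MiB objects with a single stripe, so
// in->layout.get_period() == 4 MiB): with client_readahead_max_bytes = 0
// (unlimited) and client_readahead_max_periods = 4, max_readahead ends up
// min(Readahead::NO_LIMIT, 4 MiB * 4) = 16 MiB, and readahead extents are
// aligned to both the 4 MiB period and the stripe unit.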
9272
9273 int Client::_release_fh(Fh *f)
9274 {
9275 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9276 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9277 Inode *in = f->inode.get();
9278 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
9279
9280 in->unset_deleg(f);
9281
9282 if (in->snapid == CEPH_NOSNAP) {
9283 if (in->put_open_ref(f->mode)) {
9284 _flush(in, new C_Client_FlushComplete(this, in));
9285 check_caps(in, 0);
9286 }
9287 } else {
9288 ceph_assert(in->snap_cap_refs > 0);
9289 in->snap_cap_refs--;
9290 }
9291
9292 _release_filelocks(f);
9293
9294 // Finally, read any async err (i.e. from flushes)
9295 int err = f->take_async_err();
9296 if (err != 0) {
9297 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
9298 << cpp_strerror(err) << dendl;
9299 } else {
9300 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
9301 }
9302
9303 _put_fh(f);
9304
9305 return err;
9306 }
9307
9308 void Client::_put_fh(Fh *f)
9309 {
9310 int left = f->put();
9311 if (!left) {
9312 delete f;
9313 }
9314 }
9315
9316 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9317 const UserPerm& perms)
9318 {
9319 if (in->snapid != CEPH_NOSNAP &&
9320 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
9321 return -CEPHFS_EROFS;
9322 }
9323
9324 // use normalized flags to generate cmode
9325 int cflags = ceph_flags_sys2wire(flags);
9326 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9327 cflags |= CEPH_O_LAZY;
9328
9329 int cmode = ceph_flags_to_mode(cflags);
9330 int want = ceph_caps_for_mode(cmode);
9331 int result = 0;
9332
9333 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
9334
9335 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
9336 // update wanted?
9337 check_caps(in, CHECK_CAPS_NODELAY);
9338 } else {
9339
9340 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9341 filepath path;
9342 in->make_nosnap_relative_path(path);
9343 req->set_filepath(path);
9344 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
9345 req->head.args.open.mode = mode;
9346 req->head.args.open.pool = -1;
9347 if (cct->_conf->client_debug_getattr_caps)
9348 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9349 else
9350 req->head.args.open.mask = 0;
9351 req->head.args.open.old_size = in->size; // for O_TRUNC
9352 req->set_inode(in);
9353 result = make_request(req, perms);
9354
9355 /*
9356 * NFS expects that delegations will be broken on a conflicting open,
9357 * not just when there is actual conflicting access to the file. SMB leases
9358 * and oplocks also have similar semantics.
9359 *
9360 * Ensure that clients that have delegations enabled will wait on minimal
9361 * caps during open, just to ensure that other clients holding delegations
9362 * return theirs first.
9363 */
9364 if (deleg_timeout && result == 0) {
9365 int need = 0, have;
9366
9367 if (cmode & CEPH_FILE_MODE_WR)
9368 need |= CEPH_CAP_FILE_WR;
9369 if (cmode & CEPH_FILE_MODE_RD)
9370 need |= CEPH_CAP_FILE_RD;
9371
9372 Fh fh(in, flags, cmode, fd_gen, perms);
9373 result = get_caps(&fh, need, want, &have, -1);
9374 if (result < 0) {
9375 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
9376 " . Denying open: " <<
9377 cpp_strerror(result) << dendl;
9378 } else {
9379 put_cap_ref(in, need);
9380 }
9381 }
9382 }
9383
9384 // success?
9385 if (result >= 0) {
9386 if (fhp)
9387 *fhp = _create_fh(in, flags, cmode, perms);
9388 } else {
9389 in->put_open_ref(cmode);
9390 }
9391
9392 trim_cache();
9393
9394 return result;
9395 }
9396
9397 int Client::_renew_caps(Inode *in)
9398 {
9399 int wanted = in->caps_file_wanted();
9400 if (in->is_any_caps() &&
9401 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
9402 check_caps(in, CHECK_CAPS_NODELAY);
9403 return 0;
9404 }
9405
9406 int flags = 0;
9407 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
9408 flags = O_RDWR;
9409 else if (wanted & CEPH_CAP_FILE_RD)
9410 flags = O_RDONLY;
9411 else if (wanted & CEPH_CAP_FILE_WR)
9412 flags = O_WRONLY;
9413
9414 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9415 filepath path;
9416 in->make_nosnap_relative_path(path);
9417 req->set_filepath(path);
9418 req->head.args.open.flags = flags;
9419 req->head.args.open.pool = -1;
9420 if (cct->_conf->client_debug_getattr_caps)
9421 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9422 else
9423 req->head.args.open.mask = 0;
9424 req->set_inode(in);
9425
9426 // duplicate in case Cap goes away; not sure if that race is a concern?
9427 const UserPerm *pperm = in->get_best_perms();
9428 UserPerm perms;
9429 if (pperm != NULL)
9430 perms = *pperm;
9431 int ret = make_request(req, perms);
9432 return ret;
9433 }
9434
9435 int Client::close(int fd)
9436 {
9437 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9438 if (!mref_reader.is_state_satisfied())
9439 return -CEPHFS_ENOTCONN;
9440
9441 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
9442 tout(cct) << "close" << std::endl;
9443 tout(cct) << fd << std::endl;
9444
9445 std::scoped_lock lock(client_lock);
9446 Fh *fh = get_filehandle(fd);
9447 if (!fh)
9448 return -CEPHFS_EBADF;
9449 int err = _release_fh(fh);
9450 fd_map.erase(fd);
9451 put_fd(fd);
9452 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
9453 return err;
9454 }
9455
9456
9457 // ------------
9458 // read, write
9459
9460 loff_t Client::lseek(int fd, loff_t offset, int whence)
9461 {
9462 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9463 if (!mref_reader.is_state_satisfied())
9464 return -CEPHFS_ENOTCONN;
9465
9466 tout(cct) << "lseek" << std::endl;
9467 tout(cct) << fd << std::endl;
9468 tout(cct) << offset << std::endl;
9469 tout(cct) << whence << std::endl;
9470
9471 std::scoped_lock lock(client_lock);
9472 Fh *f = get_filehandle(fd);
9473 if (!f)
9474 return -CEPHFS_EBADF;
9475 #if defined(__linux__) && defined(O_PATH)
9476 if (f->flags & O_PATH)
9477 return -CEPHFS_EBADF;
9478 #endif
9479 return _lseek(f, offset, whence);
9480 }
9481
9482 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
9483 {
9484 Inode *in = f->inode.get();
9485 bool whence_check = false;
9486 loff_t pos = -1;
9487
9488 switch (whence) {
9489 case SEEK_END:
9490 whence_check = true;
9491 break;
9492
9493 #ifdef SEEK_DATA
9494 case SEEK_DATA:
9495 whence_check = true;
9496 break;
9497 #endif
9498
9499 #ifdef SEEK_HOLE
9500 case SEEK_HOLE:
9501 whence_check = true;
9502 break;
9503 #endif
9504 }
9505
9506 if (whence_check) {
9507 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9508 if (r < 0)
9509 return r;
9510 }
9511
9512 switch (whence) {
9513 case SEEK_SET:
9514 pos = offset;
9515 break;
9516
9517 case SEEK_CUR:
9518 pos = f->pos + offset;
9519 break;
9520
9521 case SEEK_END:
9522 pos = in->size + offset;
9523 break;
9524
9525 #ifdef SEEK_DATA
9526 case SEEK_DATA:
9527 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9528 return -CEPHFS_ENXIO;
9529 pos = offset;
9530 break;
9531 #endif
9532
9533 #ifdef SEEK_HOLE
9534 case SEEK_HOLE:
9535 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
9536 return -CEPHFS_ENXIO;
9537 pos = in->size;
9538 break;
9539 #endif
9540
9541 default:
9542 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
9543 return -CEPHFS_EINVAL;
9544 }
9545
9546 if (pos < 0) {
9547 return -CEPHFS_EINVAL;
9548 } else {
9549 f->pos = pos;
9550 }
9551
9552 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
9553 return f->pos;
9554 }
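// Behavioural sketch for the SEEK_DATA/SEEK_HOLE cases above: this client
// does not track holes, so the whole file is treated as data. Assuming a
// hypothetical fd open on a 100-byte file:
//
//   client->lseek(fd, 10, SEEK_DATA);   // == 10   (data starts at offset)
//   client->lseek(fd, 10, SEEK_HOLE);   // == 100  (the "hole" begins at EOF)
//   client->lseek(fd, 200, SEEK_DATA);  // == -CEPHFS_ENXIO (beyond EOF)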
9555
9556
9557 void Client::lock_fh_pos(Fh *f)
9558 {
9559 ldout(cct, 10) << __func__ << " " << f << dendl;
9560
9561 if (f->pos_locked || !f->pos_waiters.empty()) {
9562 ceph::condition_variable cond;
9563 f->pos_waiters.push_back(&cond);
9564 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9565 std::unique_lock l{client_lock, std::adopt_lock};
9566 cond.wait(l, [f, me=&cond] {
9567 return !f->pos_locked && f->pos_waiters.front() == me;
9568 });
9569 l.release();
9570 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
9571 ceph_assert(f->pos_waiters.front() == &cond);
9572 f->pos_waiters.pop_front();
9573 }
9574
9575 f->pos_locked = true;
9576 }
9577
9578 void Client::unlock_fh_pos(Fh *f)
9579 {
9580 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9581
9582 ldout(cct, 10) << __func__ << " " << f << dendl;
9583 f->pos_locked = false;
9584 if (!f->pos_waiters.empty()) {
9585 // only wake up the oldest waiter
9586 auto cond = f->pos_waiters.front();
9587 cond->notify_one();
9588 }
9589 }
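// The pair above is a FIFO ticket lock over Fh::pos: a blocked caller queues
// its own condition variable on f->pos_waiters and proceeds only when the
// lock is free and it sits at the head of the queue; unlock_fh_pos wakes
// only that head waiter. Minimal usage sketch (the pattern _read/_write use
// when called with offset < 0):
//
//   lock_fh_pos(f);        // serialize users of the shared file position
//   loff_t off = f->pos;   // ... read or update f->pos ...
//   unlock_fh_pos(f);      // hand off to the oldest waiter, if any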
9590
9591 int Client::uninline_data(Inode *in, Context *onfinish)
9592 {
9593 if (!in->inline_data.length()) {
9594 onfinish->complete(0);
9595 return 0;
9596 }
9597
9598 char oid_buf[32];
9599 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
9600 object_t oid = oid_buf;
9601
9602 ObjectOperation create_ops;
9603 create_ops.create(false);
9604
9605 objecter->mutate(oid,
9606 OSDMap::file_to_object_locator(in->layout),
9607 create_ops,
9608 in->snaprealm->get_snap_context(),
9609 ceph::real_clock::now(),
9610 0,
9611 NULL);
9612
9613 bufferlist inline_version_bl;
9614 encode(in->inline_version, inline_version_bl);
9615
9616 ObjectOperation uninline_ops;
9617 uninline_ops.cmpxattr("inline_version",
9618 CEPH_OSD_CMPXATTR_OP_GT,
9619 CEPH_OSD_CMPXATTR_MODE_U64,
9620 inline_version_bl);
9621 bufferlist inline_data = in->inline_data;
9622 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
9623 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
9624
9625 objecter->mutate(oid,
9626 OSDMap::file_to_object_locator(in->layout),
9627 uninline_ops,
9628 in->snaprealm->get_snap_context(),
9629 ceph::real_clock::now(),
9630 0,
9631 onfinish);
9632
9633 return 0;
9634 }
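// uninline_data migrates inline file data out to the backing object in two
// mutations: first an idempotent create of object "<ino>.00000000", then a
// write fenced by cmpxattr("inline_version", GT) so that only one racing
// uninliner can land its copy; the losers fail the guard, and callers treat
// -CEPHFS_ECANCELED from onfinish as success (see the onuninline handling
// in _read and _write). The fenced mutation, in the order built above:
//
//   ObjectOperation op;
//   op.cmpxattr("inline_version", CEPH_OSD_CMPXATTR_OP_GT,
//               CEPH_OSD_CMPXATTR_MODE_U64, inline_version_bl); // fence
//   op.write(0, inline_data, truncate_size, truncate_seq);      // payload
//   op.setxattr("inline_version", stringify(inline_version));   // record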
9635
9636 // ------------
9637 // blocking osd interface
9639
9640 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
9641 {
9642 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9643 if (!mref_reader.is_state_satisfied())
9644 return -CEPHFS_ENOTCONN;
9645
9646 tout(cct) << "read" << std::endl;
9647 tout(cct) << fd << std::endl;
9648 tout(cct) << size << std::endl;
9649 tout(cct) << offset << std::endl;
9650
9651 std::unique_lock lock(client_lock);
9652 Fh *f = get_filehandle(fd);
9653 if (!f)
9654 return -CEPHFS_EBADF;
9655 #if defined(__linux__) && defined(O_PATH)
9656 if (f->flags & O_PATH)
9657 return -CEPHFS_EBADF;
9658 #endif
9659 bufferlist bl;
9660 /* We can't return more bytes read than INT_MAX, clamp size to that */
9661 size = std::min(size, (loff_t)INT_MAX);
9662 int r = _read(f, offset, size, &bl);
9663 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
9664 if (r >= 0) {
9665 lock.unlock();
9666 bl.begin().copy(bl.length(), buf);
9667 r = bl.length();
9668 }
9669 return r;
9670 }
9671
9672 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9673 {
9674 if (iovcnt < 0)
9675 return -CEPHFS_EINVAL;
9676 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9677 }
9678
9679 int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
9680 {
9681 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9682
9683 int want, have = 0;
9684 bool movepos = false;
9685 std::unique_ptr<C_SaferCond> onuninline;
9686 int64_t rc = 0;
9687 const auto& conf = cct->_conf;
9688 Inode *in = f->inode.get();
9689 utime_t lat;
9690 utime_t start = ceph_clock_now();
9691
9692 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
9693 return -CEPHFS_EBADF;
9694 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9695
9696 if (offset < 0) {
9697 lock_fh_pos(f);
9698 offset = f->pos;
9699 movepos = true;
9700 }
9701 loff_t start_pos = offset;
9702
9703 if (in->inline_version == 0) {
9704 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9705 if (r < 0) {
9706 rc = r;
9707 goto done;
9708 }
9709 ceph_assert(in->inline_version > 0);
9710 }
9711
9712 retry:
9713 if (f->mode & CEPH_FILE_MODE_LAZY)
9714 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
9715 else
9716 want = CEPH_CAP_FILE_CACHE;
9717 {
9718 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
9719 if (r < 0) {
9720 rc = r;
9721 goto done;
9722 }
9723 }
9724 if (f->flags & O_DIRECT)
9725 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
9726
9727 if (in->inline_version < CEPH_INLINE_NONE) {
9728 if (!(have & CEPH_CAP_FILE_CACHE)) {
9729 onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9730 uninline_data(in, onuninline.get());
9731 } else {
9732 uint32_t len = in->inline_data.length();
9733 uint64_t endoff = offset + size;
9734 if (endoff > in->size)
9735 endoff = in->size;
9736
9737 if (offset < len) {
9738 if (endoff <= len) {
9739 bl->substr_of(in->inline_data, offset, endoff - offset);
9740 } else {
9741 bl->substr_of(in->inline_data, offset, len - offset);
9742 bl->append_zero(endoff - len);
9743 }
9744 rc = endoff - offset;
9745 } else if ((uint64_t)offset < endoff) {
9746 bl->append_zero(endoff - offset);
9747 rc = endoff - offset;
9748 } else {
9749 rc = 0;
9750 }
9751 goto success;
9752 }
9753 }
9754
9755 if (!conf->client_debug_force_sync_read &&
9756 conf->client_oc &&
9757 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
9758
9759 if (f->flags & O_RSYNC) {
9760 _flush_range(in, offset, size);
9761 }
9762 rc = _read_async(f, offset, size, bl);
9763 if (rc < 0)
9764 goto done;
9765 } else {
9766 if (f->flags & O_DIRECT)
9767 _flush_range(in, offset, size);
9768
9769 bool checkeof = false;
9770 rc = _read_sync(f, offset, size, bl, &checkeof);
9771 if (rc < 0)
9772 goto done;
9773 if (checkeof) {
9774 offset += rc;
9775 size -= rc;
9776
9777 put_cap_ref(in, CEPH_CAP_FILE_RD);
9778 have = 0;
9779 // reverify size
9780 {
9781 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
9782 if (r < 0) {
9783 rc = r;
9784 goto done;
9785 }
9786 }
9787
9788 // eof? short read.
9789 if ((uint64_t)offset < in->size)
9790 goto retry;
9791 }
9792 }
9793
9794 success:
9795 ceph_assert(rc >= 0);
9796 if (movepos) {
9797 // adjust fd pos
9798 f->pos = start_pos + rc;
9799 }
9800
9801 lat = ceph_clock_now();
9802 lat -= start;
9803 logger->tinc(l_c_read, lat);
9804
9805 done:
9806 // done!
9807
9808 if (onuninline) {
9809 client_lock.unlock();
9810 int ret = onuninline->wait();
9811 client_lock.lock();
9812 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
9813 in->inline_data.clear();
9814 in->inline_version = CEPH_INLINE_NONE;
9815 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
9816 check_caps(in, 0);
9817 } else
9818 rc = ret;
9819 }
9820 if (have) {
9821 put_cap_ref(in, CEPH_CAP_FILE_RD);
9822 }
9823 if (movepos) {
9824 unlock_fh_pos(f);
9825 }
9826 return rc;
9827 }
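// Path selection in _read above: with client_oc enabled and FILE_CACHE (or
// LAZYIO) caps held, reads go through the object cacher via _read_async
// (with readahead); otherwise they go synchronously to the OSDs via
// _read_sync. On a short sync read past the locally known size, the cap ref
// is dropped, the size is re-fetched with _getattr, and the read retries, so
// reads racing with remote appends see the new bytes instead of a spurious
// short read. Hypothetical internal caller:
//
//   bufferlist bl;
//   int64_t n = _read(f, -1 /* use and advance f->pos */, 4096, &bl);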
9828
9829 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
9830 client(c), f(f) {
9831 f->get();
9832 f->readahead.inc_pending();
9833 }
9834
9835 Client::C_Readahead::~C_Readahead() {
9836 f->readahead.dec_pending();
9837 client->_put_fh(f);
9838 }
9839
9840 void Client::C_Readahead::finish(int r) {
9841 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
9842 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9843 }
9844
9845 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
9846 {
9847 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9848
9849 const auto& conf = cct->_conf;
9850 Inode *in = f->inode.get();
9851
9852 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9853
9854 // trim read based on file size?
9855 if (off >= in->size)
9856 return 0;
9857 if (len == 0)
9858 return 0;
9859 if (off + len > in->size) {
9860 len = in->size - off;
9861 }
9862
9863 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
9864 << " max_bytes=" << f->readahead.get_max_readahead_size()
9865 << " max_periods=" << conf->client_readahead_max_periods << dendl;
9866
9867 // read (and possibly block)
9868 int r = 0;
9869 C_SaferCond onfinish("Client::_read_async flock");
9870 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9871 off, len, bl, 0, &onfinish);
9872 if (r == 0) {
9873 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
9874 client_lock.unlock();
9875 r = onfinish.wait();
9876 client_lock.lock();
9877 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
9878 }
9879
9880 if (f->readahead.get_min_readahead_size() > 0) {
9881 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
9882 if (readahead_extent.second > 0) {
9883 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
9884 << " (caller wants " << off << "~" << len << ")" << dendl;
9885 Context *onfinish2 = new C_Readahead(this, f);
9886 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
9887 readahead_extent.first, readahead_extent.second,
9888 NULL, 0, onfinish2);
9889 if (r2 == 0) {
9890 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
9891 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
9892 } else {
9893 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
9894 delete onfinish2;
9895 }
9896 }
9897 }
9898
9899 return r;
9900 }
9901
9902 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9903 bool *checkeof)
9904 {
9905 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
9906
9907 Inode *in = f->inode.get();
9908 uint64_t pos = off;
9909 int left = len;
9910 int read = 0;
9911
9912 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
9913
9914 // Returns 0 on success, 1 to continue the read loop, and < 0 on error.
9915 auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
9916 int r = onfinish.wait();
9917
9918 // if we get ENOENT from OSD, assume 0 bytes returned
9919 if (r == -CEPHFS_ENOENT)
9920 r = 0;
9921 if (r < 0)
9922 return r;
9923
9924 if (tbl.length()) {
9925 r = tbl.length();
9926
9927 read += r;
9928 pos += r;
9929 left -= r;
9930 bl->claim_append(tbl);
9931 }
9932 // short read?
9933 if (r >= 0 && r < wanted) {
9934 if (pos < in->size) {
9935 // zero up to known EOF
9936 int64_t some = in->size - pos;
9937 if (some > left)
9938 some = left;
9939 auto z = buffer::ptr_node::create(some);
9940 z->zero();
9941 bl->push_back(std::move(z));
9942 read += some;
9943 pos += some;
9944 left -= some;
9945 if (left == 0)
9946 return 0;
9947 }
9948
9949 *checkeof = true;
9950 return 0;
9951 }
9952 return 1;
9953 };
9954
9955 while (left > 0) {
9956 C_SaferCond onfinish("Client::_read_sync flock");
9957 bufferlist tbl;
9958
9959 int wanted = left;
9960 filer->read_trunc(in->ino, &in->layout, in->snapid,
9961 pos, left, &tbl, 0,
9962 in->truncate_size, in->truncate_seq,
9963 &onfinish);
9964 client_lock.unlock();
9965 int r = wait_and_copy(onfinish, tbl, wanted);
9966 client_lock.lock();
9967 if (!r)
9968 return read;
9969 if (r < 0)
9970 return r;
9971 }
9972 return read;
9973 }
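// Short-read arithmetic for the helper above (a sketch): a request of 8 KiB
// at pos 0 that gets only 2 KiB back from the OSD, while in->size says
// 6 KiB, zero-fills the 4 KiB up to the known EOF and returns 6 KiB read;
// because 2 KiB of the request still extends past that EOF, *checkeof is
// set so the caller (_read) can re-verify the size with the MDS and retry.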
9974
9975 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9976 {
9977 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9978 if (!mref_reader.is_state_satisfied())
9979 return -CEPHFS_ENOTCONN;
9980
9981 tout(cct) << "write" << std::endl;
9982 tout(cct) << fd << std::endl;
9983 tout(cct) << size << std::endl;
9984 tout(cct) << offset << std::endl;
9985
9986 std::scoped_lock lock(client_lock);
9987 Fh *fh = get_filehandle(fd);
9988 if (!fh)
9989 return -CEPHFS_EBADF;
9990 #if defined(__linux__) && defined(O_PATH)
9991 if (fh->flags & O_PATH)
9992 return -CEPHFS_EBADF;
9993 #endif
9994 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9995 size = std::min(size, (loff_t)INT_MAX);
9996 int r = _write(fh, offset, size, buf, NULL, false);
9997 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9998 return r;
9999 }
10000
10001 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10002 {
10003 if (iovcnt < 0)
10004 return -CEPHFS_EINVAL;
10005 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10006 }
10007
10008 int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
10009 unsigned iovcnt, int64_t offset, bool write,
10010 bool clamp_to_int, std::unique_lock<ceph::mutex> &cl)
10011 {
10012 #if defined(__linux__) && defined(O_PATH)
10013 if (fh->flags & O_PATH)
10014 return -CEPHFS_EBADF;
10015 #endif
10016 loff_t totallen = 0;
10017 for (unsigned i = 0; i < iovcnt; i++) {
10018 totallen += iov[i].iov_len;
10019 }
10020
10021 /*
10022 * Some of the API functions take 64-bit size values, but only return
10023 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10024 * we don't do I/Os larger than the values we can return.
10025 */
10026 if (clamp_to_int) {
10027 totallen = std::min(totallen, (loff_t)INT_MAX);
10028 }
10029 if (write) {
10030 int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
10031 ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
10032 return w;
10033 } else {
10034 bufferlist bl;
10035 int64_t r = _read(fh, offset, totallen, &bl);
10036 ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
10037 if (r <= 0)
10038 return r;
10039
10040 cl.unlock();
10041 auto iter = bl.cbegin();
10042 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
10043 /*
10044 * This piece of code aims to handle the case that bufferlist
10045 * does not have enough data to fill in the iov
10046 */
10047 const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
10048 iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
10049 resid -= round_size;
10050 /* iter is self-updating */
10051 }
10052 cl.lock();
10053 return r;
10054 }
10055 }
10056
10057 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10058 {
10059 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10060 if (!mref_reader.is_state_satisfied())
10061 return -CEPHFS_ENOTCONN;
10062
10063 tout(cct) << fd << std::endl;
10064 tout(cct) << offset << std::endl;
10065
10066 std::unique_lock cl(client_lock);
10067 Fh *fh = get_filehandle(fd);
10068 if (!fh)
10069 return -CEPHFS_EBADF;
10070 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true, cl);
10071 }
10072
10073 int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10074 const struct iovec *iov, int iovcnt)
10075 {
10076 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10077
10078 uint64_t fpos = 0;
10079
10080 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
10081 return -CEPHFS_EFBIG;
10082
10083 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10084 Inode *in = f->inode.get();
10085
10086 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
10087 return -CEPHFS_ENOSPC;
10088 }
10089
10090 ceph_assert(in->snapid == CEPH_NOSNAP);
10091
10092 // was Fh opened as writeable?
10093 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10094 return -CEPHFS_EBADF;
10095
10096 // use/adjust fd pos?
10097 if (offset < 0) {
10098 lock_fh_pos(f);
10099 /*
10100 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10101 * change out from under us.
10102 */
10103 if (f->flags & O_APPEND) {
10104 auto r = _lseek(f, 0, SEEK_END);
10105 if (r < 0) {
10106 unlock_fh_pos(f);
10107 return r;
10108 }
10109 }
10110 offset = f->pos;
10111 fpos = offset+size;
10112 unlock_fh_pos(f);
10113 }
10114
10115 // check quota
10116 uint64_t endoff = offset + size;
10117 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10118 f->actor_perms)) {
10119 return -CEPHFS_EDQUOT;
10120 }
10121
10122 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10123
10124 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10125
10126 // time it.
10127 utime_t start = ceph_clock_now();
10128
10129 if (in->inline_version == 0) {
10130 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10131 if (r < 0)
10132 return r;
10133 ceph_assert(in->inline_version > 0);
10134 }
10135
10136 // copy into a fresh buffer (since our write may be resubmitted, or complete asynchronously)
10137 bufferlist bl;
10138 if (buf) {
10139 if (size > 0)
10140 bl.append(buf, size);
10141 } else if (iov){
10142 for (int i = 0; i < iovcnt; i++) {
10143 if (iov[i].iov_len > 0) {
10144 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10145 }
10146 }
10147 }
10148
10149 utime_t lat;
10150 uint64_t totalwritten;
10151 int want, have;
10152 if (f->mode & CEPH_FILE_MODE_LAZY)
10153 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10154 else
10155 want = CEPH_CAP_FILE_BUFFER;
10156 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
10157 if (r < 0)
10158 return r;
10159
10160 /* clear the setuid/setgid bits, if any */
10161 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
10162 struct ceph_statx stx = { 0 };
10163
10164 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10165 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10166 if (r < 0)
10167 return r;
10168 } else {
10169 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10170 }
10171
10172 if (f->flags & O_DIRECT)
10173 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
10174
10175 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10176
10177 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10178
10179 if (in->inline_version < CEPH_INLINE_NONE) {
10180 if (endoff > cct->_conf->client_max_inline_size ||
10181 endoff > CEPH_INLINE_MAX_SIZE ||
10182 !(have & CEPH_CAP_FILE_BUFFER)) {
10183 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10184 uninline_data(in, onuninline.get());
10185 } else {
10186 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10187
10188 uint32_t len = in->inline_data.length();
10189
10190 if (endoff < len)
10191 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
10192
10193 if (offset < len)
10194 in->inline_data.splice(offset, len - offset);
10195 else if (offset > len)
10196 in->inline_data.append_zero(offset - len);
10197
10198 in->inline_data.append(bl);
10199 in->inline_version++;
10200
10201 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10202
10203 goto success;
10204 }
10205 }
10206
10207 if (cct->_conf->client_oc &&
10208 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
10209 // do buffered write
10210 if (!in->oset.dirty_or_tx)
10211 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10212
10213 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10214
10215 // async, caching, non-blocking.
10216 r = objectcacher->file_write(&in->oset, &in->layout,
10217 in->snaprealm->get_snap_context(),
10218 offset, size, bl, ceph::real_clock::now(),
10219 0);
10220 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10221
10222 if (r < 0)
10223 goto done;
10224
10225 // flush cached write if O_SYNC is set on file fh
10226 // O_DSYNC == O_SYNC on linux < 2.6.33
10227 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10228 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10229 _flush_range(in, offset, size);
10230 }
10231 } else {
10232 if (f->flags & O_DIRECT)
10233 _flush_range(in, offset, size);
10234
10235 // simple, non-atomic sync write
10236 C_SaferCond onfinish("Client::_write flock");
10237 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10238
10239 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10240 offset, size, bl, ceph::real_clock::now(), 0,
10241 in->truncate_size, in->truncate_seq,
10242 &onfinish);
10243 client_lock.unlock();
10244 r = onfinish.wait();
10245 client_lock.lock();
10246 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10247 if (r < 0)
10248 goto done;
10249 }
10250
10251 // if we get here, write was successful, update client metadata
10252 success:
10253 // time
10254 lat = ceph_clock_now();
10255 lat -= start;
10256 logger->tinc(l_c_wrlat, lat);
10257
10258 if (fpos) {
10259 lock_fh_pos(f);
10260 f->pos = fpos;
10261 unlock_fh_pos(f);
10262 }
10263 totalwritten = size;
10264 r = (int64_t)totalwritten;
10265
10266 // extend file?
10267 if (totalwritten + offset > in->size) {
10268 in->size = totalwritten + offset;
10269 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10270
10271 if (is_quota_bytes_approaching(in, f->actor_perms)) {
10272 check_caps(in, CHECK_CAPS_NODELAY);
10273 } else if (is_max_size_approaching(in)) {
10274 check_caps(in, 0);
10275 }
10276
10277 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10278 } else {
10279 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10280 }
10281
10282 // mtime
10283 in->mtime = in->ctime = ceph_clock_now();
10284 in->change_attr++;
10285 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10286
10287 done:
10288
10289 if (nullptr != onuninline) {
10290 client_lock.unlock();
10291 int uninline_ret = onuninline->wait();
10292 client_lock.lock();
10293
10294 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
10295 in->inline_data.clear();
10296 in->inline_version = CEPH_INLINE_NONE;
10297 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
10298 check_caps(in, 0);
10299 } else
10300 r = uninline_ret;
10301 }
10302
10303 put_cap_ref(in, CEPH_CAP_FILE_WR);
10304 return r;
10305 }
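// Path selection in _write above mirrors the read side: with client_oc and
// FILE_BUFFER (or LAZYIO) caps the data is buffered in the object cacher and
// flushed immediately only for O_SYNC/O_DSYNC handles; otherwise it is a
// synchronous Filer::write_trunc to the OSDs. Either way size/mtime/ctime
// are updated locally and CEPH_CAP_FILE_WR is marked dirty rather than
// round-tripping to the MDS. Hypothetical internal caller:
//
//   int64_t n = _write(fh, -1 /* use fh->pos; seeks to EOF first under
//                                O_APPEND */, len, buf, nullptr, 0);
//   if (n < 0) { /* e.g. -CEPHFS_EDQUOT when a quota would be exceeded */ }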
10306
10307 int Client::_flush(Fh *f)
10308 {
10309 Inode *in = f->inode.get();
10310 int err = f->take_async_err();
10311 if (err != 0) {
10312 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10313 << cpp_strerror(err) << dendl;
10314 } else {
10315 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10316 }
10317
10318 return err;
10319 }
10320
10321 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10322 {
10323 struct ceph_statx stx;
10324 stx.stx_size = length;
10325 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10326 }
10327
10328 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
10329 {
10330 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10331 if (!mref_reader.is_state_satisfied())
10332 return -CEPHFS_ENOTCONN;
10333
10334 tout(cct) << __func__ << std::endl;
10335 tout(cct) << fd << std::endl;
10336 tout(cct) << length << std::endl;
10337
10338 std::scoped_lock lock(client_lock);
10339 Fh *f = get_filehandle(fd);
10340 if (!f)
10341 return -CEPHFS_EBADF;
10342 #if defined(__linux__) && defined(O_PATH)
10343 if (f->flags & O_PATH)
10344 return -CEPHFS_EBADF;
10345 #endif
10346 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
10347 return -CEPHFS_EBADF;
10348 struct stat attr;
10349 attr.st_size = length;
10350 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
10351 }
10352
10353 int Client::fsync(int fd, bool syncdataonly)
10354 {
10355 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10356 if (!mref_reader.is_state_satisfied())
10357 return -CEPHFS_ENOTCONN;
10358
10359 tout(cct) << "fsync" << std::endl;
10360 tout(cct) << fd << std::endl;
10361 tout(cct) << syncdataonly << std::endl;
10362
10363 std::scoped_lock lock(client_lock);
10364 Fh *f = get_filehandle(fd);
10365 if (!f)
10366 return -CEPHFS_EBADF;
10367 #if defined(__linux__) && defined(O_PATH)
10368 if (f->flags & O_PATH)
10369 return -CEPHFS_EBADF;
10370 #endif
10371 int r = _fsync(f, syncdataonly);
10372 if (r == 0) {
10373 // The IOs in this fsync were okay, but maybe something happened
10374 // in the background that we should be reporting?
10375 r = f->take_async_err();
10376 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
10377 << ") = 0, async_err = " << r << dendl;
10378 } else {
10379 // Assume that an error we encountered during fsync, even reported
10380 // synchronously, would also have applied the error to the Fh, and we
10381 // should clear it here to avoid returning the same error again on next
10382 // call.
10383 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
10384 << r << dendl;
10385 f->take_async_err();
10386 }
10387 return r;
10388 }
10389
10390 int Client::_fsync(Inode *in, bool syncdataonly)
10391 {
10392 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10393
10394 int r = 0;
10395 std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
10396 ceph_tid_t flush_tid = 0;
10397 InodeRef tmp_ref;
10398 utime_t lat;
10399 utime_t start = ceph_clock_now();
10400
10401 ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
10402
10403 if (cct->_conf->client_oc) {
10404 object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
10405 tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
10406 _flush(in, object_cacher_completion.get());
10407 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
10408 }
10409
10410 if (!syncdataonly && in->dirty_caps) {
10411 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
10412 if (in->flushing_caps)
10413 flush_tid = last_flush_tid;
10414 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
10415
10416 if (!syncdataonly && !in->unsafe_ops.empty()) {
10417 flush_mdlog_sync();
10418
10419 MetaRequest *req = in->unsafe_ops.back();
10420 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
10421
10422 req->get();
10423 wait_on_list(req->waitfor_safe);
10424 put_request(req);
10425 }
10426
10427 if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
10428 client_lock.unlock();
10429 ldout(cct, 15) << "waiting on data to flush" << dendl;
10430 r = object_cacher_completion->wait();
10431 client_lock.lock();
10432 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
10433 } else {
10434 // FIXME: this can starve
10435 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
10436 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
10437 << " uncommitted, waiting" << dendl;
10438 wait_on_list(in->waitfor_commit);
10439 }
10440 }
10441
10442 if (!r) {
10443 if (flush_tid > 0)
10444 wait_sync_caps(in, flush_tid);
10445
10446 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
10447 } else {
10448 ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
10449 << cpp_strerror(-r) << dendl;
10450 }
10451
10452 lat = ceph_clock_now();
10453 lat -= start;
10454 logger->tinc(l_c_fsync, lat);
10455
10456 return r;
10457 }
10458
10459 int Client::_fsync(Fh *f, bool syncdataonly)
10460 {
10461 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
10462 return _fsync(f->inode.get(), syncdataonly);
10463 }
10464
10465 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10466 {
10467 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10468 if (!mref_reader.is_state_satisfied())
10469 return -CEPHFS_ENOTCONN;
10470
10471 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10472 tout(cct) << fd << std::endl;
10473
10474 std::scoped_lock lock(client_lock);
10475 Fh *f = get_filehandle(fd);
10476 if (!f)
10477 return -CEPHFS_EBADF;
10478 int r = _getattr(f->inode, mask, perms);
10479 if (r < 0)
10480 return r;
10481 fill_stat(f->inode, stbuf, NULL);
10482 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
10483 return r;
10484 }
10485
10486 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
10487 unsigned int want, unsigned int flags)
10488 {
10489 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10490 if (!mref_reader.is_state_satisfied())
10491 return -CEPHFS_ENOTCONN;
10492
10493 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
10494 tout(cct) << fd << std::endl;
10495
10496 std::scoped_lock lock(client_lock);
10497 Fh *f = get_filehandle(fd);
10498 if (!f)
10499 return -CEPHFS_EBADF;
10500
10501 unsigned mask = statx_to_mask(flags, want);
10502
10503 int r = 0;
10504 if (mask && !f->inode->caps_issued_mask(mask, true)) {
10505 r = _getattr(f->inode, mask, perms);
10506 if (r < 0) {
10507 ldout(cct, 3) << "fstatx exit on error!" << dendl;
10508 return r;
10509 }
10510 }
10511
10512 fill_statx(f->inode, mask, stx);
10513 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
10514 return r;
10515 }
10516
10517 // not written yet, but I want to link!
10518
10519 int Client::chdir(const char *relpath, std::string &new_cwd,
10520 const UserPerm& perms)
10521 {
10522 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10523 if (!mref_reader.is_state_satisfied())
10524 return -CEPHFS_ENOTCONN;
10525
10526 tout(cct) << "chdir" << std::endl;
10527 tout(cct) << relpath << std::endl;
10528
10529 filepath path(relpath);
10530 InodeRef in;
10531
10532 std::scoped_lock lock(client_lock);
10533 int r = path_walk(path, &in, perms);
10534 if (r < 0)
10535 return r;
10536
10537 if (!(in.get()->is_dir()))
10538 return -CEPHFS_ENOTDIR;
10539
10540 if (cwd != in)
10541 cwd.swap(in);
10542 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
10543
10544 _getcwd(new_cwd, perms);
10545 return 0;
10546 }
10547
10548 void Client::_getcwd(string& dir, const UserPerm& perms)
10549 {
10550 filepath path;
10551 ldout(cct, 10) << __func__ << " " << *cwd << dendl;
10552
10553 Inode *in = cwd.get();
10554 while (in != root) {
10555 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
10556
10557 // The cwd or an ancestor is unlinked
10558 if (in->dentries.empty()) {
10559 return;
10560 }
10561
10562 Dentry *dn = in->get_first_parent();
10563
10564
10565 if (!dn) {
10566 // look it up
10567 ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
10568 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
10569 filepath path(in->ino);
10570 req->set_filepath(path);
10571 req->set_inode(in);
10572 int res = make_request(req, perms);
10573 if (res < 0)
10574 break;
10575
10576 // start over
10577 path = filepath();
10578 in = cwd.get();
10579 continue;
10580 }
10581 path.push_front_dentry(dn->name);
10582 in = dn->dir->parent_inode;
10583 }
10584 dir = "/";
10585 dir += path.get_path();
10586 }
10587
10588 void Client::getcwd(string& dir, const UserPerm& perms)
10589 {
10590 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10591 if (!mref_reader.is_state_satisfied())
10592 return;
10593
10594 std::scoped_lock l(client_lock);
10595
10596 _getcwd(dir, perms);
10597 }
10598
10599 int Client::statfs(const char *path, struct statvfs *stbuf,
10600 const UserPerm& perms)
10601 {
10602 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10603 if (!mref_reader.is_state_satisfied())
10604 return -CEPHFS_ENOTCONN;
10605
10606 tout(cct) << __func__ << std::endl;
10607 unsigned long int total_files_on_fs;
10608
10609 ceph_statfs stats;
10610 C_SaferCond cond;
10611
10612 std::unique_lock lock(client_lock);
10613 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
10614 if (data_pools.size() == 1) {
10615 objecter->get_fs_stats(stats, data_pools[0], &cond);
10616 } else {
10617 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
10618 }
10619
10620 lock.unlock();
10621 int rval = cond.wait();
10622 lock.lock();
10623
10624 ceph_assert(root);
10625 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
10626
10627 if (rval < 0) {
10628 ldout(cct, 1) << "underlying call to statfs returned error: "
10629 << cpp_strerror(rval)
10630 << dendl;
10631 return rval;
10632 }
10633
10634 memset(stbuf, 0, sizeof(*stbuf));
10635
10636 /*
10637 * we're going to set a block size of 4MB so we can represent larger
10638 * FSes without overflowing. Additionally convert the space
10639 * measurements from KB to bytes while making them in terms of
10640 * blocks. We use 4MB only because it is big enough, and because it
10641 * actually *is* the (ceph) default block size.
10642 */
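// Worked conversion: stats.kb is reported in KiB, and one 4 MiB block is
// 2^22 bytes == 2^12 KiB, so ">> (CEPH_BLOCK_SHIFT - 10)" == ">> 12" turns
// KiB directly into blocks. E.g. stats.kb = 8388608 KiB (8 GiB) >> 12
// == 2048 four-megabyte blocks.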
10643 const int CEPH_BLOCK_SHIFT = 22;
10644 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
10645 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
10646 stbuf->f_files = total_files_on_fs;
10647 stbuf->f_ffree = -1;
10648 stbuf->f_favail = -1;
10649 stbuf->f_fsid = -1; // ??
10650 stbuf->f_flag = 0; // ??
10651 stbuf->f_namemax = NAME_MAX;
10652
10653 // Usually quota_root will == root_ancestor, but if the mount root has no
10654 // quota but we can see a parent of it that does have a quota, we'll
10655 // respect that one instead.
10656 ceph_assert(root != nullptr);
10657 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
10658
10659 // get_quota_root should always give us something
10660 // because client quotas are always enabled
10661 ceph_assert(quota_root != nullptr);
10662
10663 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
10664
10665 // Skip the getattr if any sessions are stale, as we don't want to
10666 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10667 // is unhealthy.
10668 if (!_any_stale_sessions()) {
10669 int r = _getattr(quota_root, 0, perms, true);
10670 if (r != 0) {
10671 // Ignore return value: error getting latest inode metadata is not a good
10672 // reason to break "df".
10673 lderr(cct) << "Error in getattr on quota root 0x"
10674 << std::hex << quota_root->ino << std::dec
10675 << " statfs result may be outdated" << dendl;
10676 }
10677 }
10678
10679 // Special case: if there is a size quota set on the Inode acting
10680 // as the root for this client mount, then report the quota status
10681 // as the filesystem statistics.
10682 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
10683 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
10684 // It is possible for a quota to be exceeded: arithmetic here must
10685 // handle case where used > total.
10686 const fsblkcnt_t free = total > used ? total - used : 0;
10687
10688 stbuf->f_blocks = total;
10689 stbuf->f_bfree = free;
10690 stbuf->f_bavail = free;
10691 } else {
10692 // General case: report the cluster statistics returned from RADOS. Because
10693 // multiple pools may be used within one filesystem namespace via
10694 // layouts, this is the most correct thing we can do.
10695 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10696 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10697 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10698 }
10699
10700 return rval;
10701 }
10702
10703 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10704 struct flock *fl, uint64_t owner, bool removing)
10705 {
10706 ldout(cct, 10) << __func__ << " ino " << in->ino
10707 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10708 << " type " << fl->l_type << " owner " << owner
10709 << " " << fl->l_start << "~" << fl->l_len << dendl;
10710
10711 if (in->flags & I_ERROR_FILELOCK)
10712 return -CEPHFS_EIO;
10713
10714 int lock_cmd;
10715 if (F_RDLCK == fl->l_type)
10716 lock_cmd = CEPH_LOCK_SHARED;
10717 else if (F_WRLCK == fl->l_type)
10718 lock_cmd = CEPH_LOCK_EXCL;
10719 else if (F_UNLCK == fl->l_type)
10720 lock_cmd = CEPH_LOCK_UNLOCK;
10721 else
10722 return -CEPHFS_EIO;
10723
10724 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10725 sleep = 0;
10726
10727 /*
10728 * Set the most significant bit, so that the MDS knows the 'owner'
10729 * alone is sufficient to identify the owner of the lock. (old code
10730 * uses both 'owner' and 'pid')
10731 */
10732 owner |= (1ULL << 63);
10733
10734 MetaRequest *req = new MetaRequest(op);
10735 filepath path;
10736 in->make_nosnap_relative_path(path);
10737 req->set_filepath(path);
10738 req->set_inode(in);
10739
10740 req->head.args.filelock_change.rule = lock_type;
10741 req->head.args.filelock_change.type = lock_cmd;
10742 req->head.args.filelock_change.owner = owner;
10743 req->head.args.filelock_change.pid = fl->l_pid;
10744 req->head.args.filelock_change.start = fl->l_start;
10745 req->head.args.filelock_change.length = fl->l_len;
10746 req->head.args.filelock_change.wait = sleep;
10747
10748 int ret;
10749 bufferlist bl;
10750
10751 if (sleep && switch_interrupt_cb) {
10752 // enable interrupt
10753 switch_interrupt_cb(callback_handle, req->get());
10754 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10755 // disable interrupt
10756 switch_interrupt_cb(callback_handle, NULL);
10757 if (ret == 0 && req->aborted()) {
10758 // effect of this lock request has been revoked by the 'lock intr' request
10759 ret = req->get_abort_code();
10760 }
10761 put_request(req);
10762 } else {
10763 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10764 }
10765
10766 if (ret == 0) {
10767 if (op == CEPH_MDS_OP_GETFILELOCK) {
10768 ceph_filelock filelock;
10769 auto p = bl.cbegin();
10770 decode(filelock, p);
10771
10772 if (CEPH_LOCK_SHARED == filelock.type)
10773 fl->l_type = F_RDLCK;
10774 else if (CEPH_LOCK_EXCL == filelock.type)
10775 fl->l_type = F_WRLCK;
10776 else
10777 fl->l_type = F_UNLCK;
10778
10779 fl->l_whence = SEEK_SET;
10780 fl->l_start = filelock.start;
10781 fl->l_len = filelock.length;
10782 fl->l_pid = filelock.pid;
10783 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10784 ceph_lock_state_t *lock_state;
10785 if (lock_type == CEPH_LOCK_FCNTL) {
10786 if (!in->fcntl_locks)
10787 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10788 lock_state = in->fcntl_locks.get();
10789 } else if (lock_type == CEPH_LOCK_FLOCK) {
10790 if (!in->flock_locks)
10791 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10792 lock_state = in->flock_locks.get();
10793 } else {
10794 ceph_abort();
10795 return -CEPHFS_EINVAL;
10796 }
10797 _update_lock_state(fl, owner, lock_state);
10798
10799 if (!removing) {
10800 if (lock_type == CEPH_LOCK_FCNTL) {
10801 if (!fh->fcntl_locks)
10802 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10803 lock_state = fh->fcntl_locks.get();
10804 } else {
10805 if (!fh->flock_locks)
10806 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10807 lock_state = fh->flock_locks.get();
10808 }
10809 _update_lock_state(fl, owner, lock_state);
10810 }
10811 } else
10812 ceph_abort();
10813 }
10814 return ret;
10815 }
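// Hypothetical probe using the helper above, equivalent to fcntl(F_GETLK):
// ask whether a whole-file write lock could be taken (`fh` and `owner` are
// assumed to come from the caller, as in _getlk):
//
//   struct flock fl;
//   memset(&fl, 0, sizeof(fl));
//   fl.l_type = F_WRLCK;    // "could I take a write lock..."
//   fl.l_whence = SEEK_SET;
//   fl.l_start = 0;
//   fl.l_len = 0;           // "...over the whole file?"
//   int r = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK,
//                        0, &fl, owner);
//   // On success fl.l_type is F_UNLCK if nothing conflicts; otherwise the
//   // conflicting lock's type, range and pid are filled in.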
10816
10817 int Client::_interrupt_filelock(MetaRequest *req)
10818 {
10819 // Set abort code, but do not kick. The abort code prevents the request
10820 // from being re-sent.
10821 req->abort(-CEPHFS_EINTR);
10822 if (req->mds < 0)
10823 return 0; // haven't sent the request
10824
10825 Inode *in = req->inode();
10826
10827 int lock_type;
10828 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10829 lock_type = CEPH_LOCK_FLOCK_INTR;
10830 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10831 lock_type = CEPH_LOCK_FCNTL_INTR;
10832 else {
10833 ceph_abort();
10834 return -CEPHFS_EINVAL;
10835 }
10836
10837 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10838 filepath path;
10839 in->make_nosnap_relative_path(path);
10840 intr_req->set_filepath(path);
10841 intr_req->set_inode(in);
10842 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10843 intr_req->head.args.filelock_change.rule = lock_type;
10844 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10845
10846 UserPerm perms(req->get_uid(), req->get_gid());
10847 return make_request(intr_req, perms, NULL, NULL, -1);
10848 }
10849
10850 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
10851 {
10852 if (!in->fcntl_locks && !in->flock_locks)
10853 return;
10854
10855 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
10856 encode(nr_fcntl_locks, bl);
10857 if (nr_fcntl_locks) {
10858 auto &lock_state = in->fcntl_locks;
10859 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10860 p != lock_state->held_locks.end();
10861 ++p)
10862 encode(p->second, bl);
10863 }
10864
10865 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
10866 encode(nr_flock_locks, bl);
10867 if (nr_flock_locks) {
10868 auto &lock_state = in->flock_locks;
10869 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10870 p != lock_state->held_locks.end();
10871 ++p)
10872 encode(p->second, bl);
10873 }
10874
10875 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
10876 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10877 }
10878
10879 void Client::_release_filelocks(Fh *fh)
10880 {
10881 if (!fh->fcntl_locks && !fh->flock_locks)
10882 return;
10883
10884 Inode *in = fh->inode.get();
10885 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
10886
10887 list<ceph_filelock> activated_locks;
10888
10889 list<pair<int, ceph_filelock> > to_release;
10890
10891 if (fh->fcntl_locks) {
10892 auto &lock_state = fh->fcntl_locks;
10893 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10894 auto q = p++;
10895 if (in->flags & I_ERROR_FILELOCK) {
10896 lock_state->remove_lock(q->second, activated_locks);
10897 } else {
10898 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
10899 }
10900 }
10901 lock_state.reset();
10902 }
10903 if (fh->flock_locks) {
10904 auto &lock_state = fh->flock_locks;
10905 for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
10906 auto q = p++;
10907 if (in->flags & I_ERROR_FILELOCK) {
10908 lock_state->remove_lock(q->second, activated_locks);
10909 } else {
10910 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
10911 }
10912 }
10913 lock_state.reset();
10914 }
10915
10916 if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
10917 in->flags &= ~I_ERROR_FILELOCK;
10918
10919 if (to_release.empty())
10920 return;
10921
10922 struct flock fl;
10923 memset(&fl, 0, sizeof(fl));
10924 fl.l_whence = SEEK_SET;
10925 fl.l_type = F_UNLCK;
10926
10927 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10928 p != to_release.end();
10929 ++p) {
10930 fl.l_start = p->second.start;
10931 fl.l_len = p->second.length;
10932 fl.l_pid = p->second.pid;
10933 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10934 p->second.owner, true);
10935 }
10936 }
10937
10938 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10939 ceph_lock_state_t *lock_state)
10940 {
10941 int lock_cmd;
10942 if (F_RDLCK == fl->l_type)
10943 lock_cmd = CEPH_LOCK_SHARED;
10944 else if (F_WRLCK == fl->l_type)
10945 lock_cmd = CEPH_LOCK_EXCL;
10946 else
10947 lock_cmd = CEPH_LOCK_UNLOCK;
10948
10949 ceph_filelock filelock;
10950 filelock.start = fl->l_start;
10951 filelock.length = fl->l_len;
10952 filelock.client = 0;
10953 // see comment in _do_filelock()
10954 filelock.owner = owner | (1ULL << 63);
10955 filelock.pid = fl->l_pid;
10956 filelock.type = lock_cmd;
10957
10958 if (filelock.type == CEPH_LOCK_UNLOCK) {
10959 list<ceph_filelock> activated_locks;
10960 lock_state->remove_lock(filelock, activated_locks);
10961 } else {
10962 bool r = lock_state->add_lock(filelock, false, false, NULL);
10963 ceph_assert(r);
10964 }
10965 }
10966
10967 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10968 {
10969 Inode *in = fh->inode.get();
10970 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10971 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10972 return ret;
10973 }
10974
10975 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10976 {
10977 Inode *in = fh->inode.get();
10978 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10979 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10980 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10981 return ret;
10982 }
10983
10984 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10985 {
10986 Inode *in = fh->inode.get();
10987 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10988
10989 int sleep = !(cmd & LOCK_NB);
10990 cmd &= ~LOCK_NB;
10991
10992 int type;
10993 switch (cmd) {
10994 case LOCK_SH:
10995 type = F_RDLCK;
10996 break;
10997 case LOCK_EX:
10998 type = F_WRLCK;
10999 break;
11000 case LOCK_UN:
11001 type = F_UNLCK;
11002 break;
11003 default:
11004 return -CEPHFS_EINVAL;
11005 }
11006
11007 struct flock fl;
11008 memset(&fl, 0, sizeof(fl));
11009 fl.l_type = type;
11010 fl.l_whence = SEEK_SET;
11011
11012 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11013 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11014 return ret;
11015 }
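// Mapping sketch for _flock above: flock(fd, LOCK_EX | LOCK_NB) arrives as
// cmd = LOCK_EX|LOCK_NB, so sleep == 0 and the request is a whole-file
// F_WRLCK (l_start = l_len = 0, SEEK_SET) issued as CEPH_MDS_OP_SETFILELOCK
// under the CEPH_LOCK_FLOCK rule; without LOCK_NB, sleep == 1 lets the MDS
// queue the waiter, and the interrupt path (_interrupt_filelock) can cancel
// it.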
11016
11017 int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11018 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11019 if (!mref_reader.is_state_satisfied()) {
11020 return -CEPHFS_ENOTCONN;
11021 }
11022
11023 std::unique_lock locker(client_lock);
11024 InodeRef in;
11025 int r = Client::path_walk(path, &in, perms, true);
11026 if (r < 0) {
11027 return r;
11028 }
11029
11030 if (in->snapid == CEPH_NOSNAP) {
11031 return -CEPHFS_EINVAL;
11032 }
11033
11034 snap_info->id = in->snapid;
11035 snap_info->metadata = in->snap_metadata;
11036 return 0;
11037 }
11038
11039 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11040 {
11041 /* Since the only thing this does is wrap a call to statfs, and
11042 statfs takes a lock, it doesn't seem we have a need to split it
11043 out. */
11044 return statfs(0, stbuf, perms);
11045 }
11046
11047 void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11048 {
11049 if (!args)
11050 return;
11051 std::scoped_lock l(client_lock);
11052 ldout(cct, 10) << __func__ << " cb " << args->handle
11053 << " invalidate_ino_cb " << args->ino_cb
11054 << " invalidate_dentry_cb " << args->dentry_cb
11055 << " switch_interrupt_cb " << args->switch_intr_cb
11056 << " remount_cb " << args->remount_cb
11057 << dendl;
11058 callback_handle = args->handle;
11059 if (args->ino_cb) {
11060 ino_invalidate_cb = args->ino_cb;
11061 async_ino_invalidator.start();
11062 }
11063 if (args->dentry_cb) {
11064 dentry_invalidate_cb = args->dentry_cb;
11065 async_dentry_invalidator.start();
11066 }
11067 if (args->switch_intr_cb) {
11068 switch_interrupt_cb = args->switch_intr_cb;
11069 interrupt_finisher.start();
11070 }
11071 if (args->remount_cb) {
11072 remount_cb = args->remount_cb;
11073 remount_finisher.start();
11074 }
11075 if (args->ino_release_cb) {
11076 ino_release_cb = args->ino_release_cb;
11077 async_ino_releasor.start();
11078 }
11079 if (args->umask_cb)
11080 umask_cb = args->umask_cb;
11081 }
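// Registration sketch: callers (e.g. the FUSE glue) zero the struct and fill
// in only the callbacks they support; each non-null callback above also
// starts the matching finisher thread. Hypothetical minimal setup:
//
//   struct ceph_client_callback_args args = {};
//   args.handle = my_state;           // opaque pointer handed back to cbs
//   args.ino_cb = my_ino_invalidate;  // unset callbacks are simply skipped
//   client->ll_register_callbacks(&args);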
11082
11083 int Client::test_dentry_handling(bool can_invalidate)
11084 {
11085 int r = 0;
11086
11087 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11088 if (!iref_reader.is_state_satisfied())
11089 return -CEPHFS_ENOTCONN;
11090
11091 can_invalidate_dentries = can_invalidate;
11092
11093 if (can_invalidate_dentries) {
11094 ceph_assert(dentry_invalidate_cb);
11095 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11096 r = 0;
11097 } else {
11098 ceph_assert(remount_cb);
11099 ldout(cct, 1) << "using remount_cb" << dendl;
11100 r = _do_remount(false);
11101 }
11102
11103 return r;
11104 }
11105
11106 int Client::_sync_fs()
11107 {
11108 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11109
11110 ldout(cct, 10) << __func__ << dendl;
11111
11112 // flush file data
11113 std::unique_ptr<C_SaferCond> cond = nullptr;
11114 if (cct->_conf->client_oc) {
11115 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11116 objectcacher->flush_all(cond.get());
11117 }
11118
11119 // flush caps
11120 flush_caps_sync();
11121 ceph_tid_t flush_tid = last_flush_tid;
11122
11123 // wait for unsafe mds requests
11124 wait_unsafe_requests();
11125
11126 wait_sync_caps(flush_tid);
11127
11128 if (nullptr != cond) {
11129 client_lock.unlock();
11130 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11131 cond->wait();
11132 ldout(cct, 15) << __func__ << " flush finished" << dendl;
11133 client_lock.lock();
11134 }
11135
11136 return 0;
11137 }
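// Note on the ordering above: the objectcacher flush is kicked off first and
// waited on last, and client_lock is dropped for that wait, so work that
// needs the lock (e.g. handling cap flush acks from the MDS) can make
// progress while we block on the data flush.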
11138
11139 int Client::sync_fs()
11140 {
11141 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11142 if (!mref_reader.is_state_satisfied())
11143 return -CEPHFS_ENOTCONN;
11144
11145 std::scoped_lock l(client_lock);
11146
11147 return _sync_fs();
11148 }
11149
11150 int64_t Client::drop_caches()
11151 {
11152 std::scoped_lock l(client_lock);
11153 return objectcacher->release_all();
11154 }
11155
11156 int Client::_lazyio(Fh *fh, int enable)
11157 {
11158 Inode *in = fh->inode.get();
11159 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11160
11161 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11162 return 0;
11163
11164 int orig_mode = fh->mode;
11165 if (enable) {
11166 fh->mode |= CEPH_FILE_MODE_LAZY;
11167 in->get_open_ref(fh->mode);
11168 in->put_open_ref(orig_mode);
11169 check_caps(in, CHECK_CAPS_NODELAY);
11170 } else {
11171 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11172 in->get_open_ref(fh->mode);
11173 in->put_open_ref(orig_mode);
11174 check_caps(in, 0);
11175 }
11176
11177 return 0;
11178 }
11179
11180 int Client::lazyio(int fd, int enable)
11181 {
11182 std::scoped_lock l(client_lock);
11183 Fh *f = get_filehandle(fd);
11184 if (!f)
11185 return -CEPHFS_EBADF;
11186
11187 return _lazyio(f, enable);
11188 }
11189
11190 int Client::ll_lazyio(Fh *fh, int enable)
11191 {
11192 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11193 tout(cct) << __func__ << std::endl;
11194
11195 std::scoped_lock lock(client_lock);
11196 return _lazyio(fh, enable);
11197 }
11198
11199 int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
11200 {
11201 std::scoped_lock l(client_lock);
11202 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
11203 << ", " << offset << ", " << count << ")" << dendl;
11204
11205 Fh *f = get_filehandle(fd);
11206 if (!f)
11207 return -CEPHFS_EBADF;
11208
11209 // for now, just fsync the whole file rather than only [offset, offset+count)
11210 _fsync(f, true);
11211
11212 return 0;
11213 }
11214
11215 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11216 {
11217 std::scoped_lock l(client_lock);
11218 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11219 << ", " << offset << ", " << count << ")" << dendl;
11220
11221 Fh *f = get_filehandle(fd);
11222 if (!f)
11223 return -CEPHFS_EBADF;
11224 Inode *in = f->inode.get();
11225
11226 _fsync(f, true);
11227 if (_release(in)) {
11228 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11229 if (r < 0)
11230 return r;
11231 }
11232 return 0;
11233 }
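// Lazy I/O usage sketch: once CEPH_FILE_MODE_LAZY is enabled the application
// manages coherency itself. Via the libcephfs wrappers (assuming a mounted
// cmount and an open fd), a writer publishes and a reader revalidates
// roughly like this:
//
//   ceph_lazyio(cmount, fd, 1);                // opt in to lazy I/O
//   ...buffered writes...
//   ceph_lazyio_propagate(cmount, fd, 0, 0);   // writer: flush to OSDs
//   ceph_lazyio_synchronize(cmount, fd, 0, 0); // reader: drop stale cache
//
// As the comment in lazyio_propagate() notes, the offset/count range is
// currently ignored and the whole file is fsynced.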
11234
11235
11236 // =============================
11237 // snaps
11238
11239 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11240 mode_t mode, const std::map<std::string, std::string> &metadata)
11241 {
11242 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11243 if (!mref_reader.is_state_satisfied())
11244 return -CEPHFS_ENOTCONN;
11245
11246 std::scoped_lock l(client_lock);
11247
11248 filepath path(relpath);
11249 InodeRef in;
11250 int r = path_walk(path, &in, perm);
11251 if (r < 0)
11252 return r;
11253 if (cct->_conf->client_permissions) {
11254 r = may_create(in.get(), perm);
11255 if (r < 0)
11256 return r;
11257 }
11258 Inode *snapdir = open_snapdir(in.get());
11259 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
11260 }
11261
11262 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
11263 {
11264 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11265 if (!mref_reader.is_state_satisfied())
11266 return -CEPHFS_ENOTCONN;
11267
11268 std::scoped_lock l(client_lock);
11269
11270 filepath path(relpath);
11271 InodeRef in;
11272 int r = path_walk(path, &in, perms);
11273 if (r < 0)
11274 return r;
11275 Inode *snapdir = open_snapdir(in.get());
11276 if (cct->_conf->client_permissions) {
11277 r = may_delete(snapdir, check_perms ? name : NULL, perms);
11278 if (r < 0)
11279 return r;
11280 }
11281 return _rmdir(snapdir, name, perms);
11282 }
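// Snapshot usage sketch (assuming a mounted client with snapshot permission):
//
//   std::map<std::string, std::string> md = {{"owner", "backup-job"}};
//   client->mksnap("/data/dir", "before-upgrade", perms, 0755, md);
//   ...
//   client->rmsnap("/data/dir", "before-upgrade", perms, true);
//
// Both resolve the directory, open its snapdir via open_snapdir(), and then
// reuse _mkdir()/_rmdir(), which become MKSNAP/RMSNAP MDS ops for snapdir
// parents (see the CEPH_MDS_OP_MKSNAP branch in _mkdir() below).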
11283
11284 // =============================
11285 // expose caps
11286
11287 int Client::get_caps_issued(int fd)
11288 {
11289 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11290 if (!mref_reader.is_state_satisfied())
11291 return -CEPHFS_ENOTCONN;
11292
11293 std::scoped_lock lock(client_lock);
11294
11295 Fh *f = get_filehandle(fd);
11296 if (!f)
11297 return -CEPHFS_EBADF;
11298
11299 return f->inode->caps_issued();
11300 }
11301
11302 int Client::get_caps_issued(const char *path, const UserPerm& perms)
11303 {
11304 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11305 if (!mref_reader.is_state_satisfied())
11306 return -CEPHFS_ENOTCONN;
11307
11308 std::scoped_lock lock(client_lock);
11309
11310 filepath p(path);
11311 InodeRef in;
11312 int r = path_walk(p, &in, perms, true);
11313 if (r < 0)
11314 return r;
11315 return in->caps_issued();
11316 }
11317
11318 // =========================================
11319 // low level
11320
11321 Inode *Client::open_snapdir(Inode *diri)
11322 {
11323 Inode *in;
11324 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11325 if (!inode_map.count(vino)) {
11326 in = new Inode(this, vino, &diri->layout);
11327
11328 in->ino = diri->ino;
11329 in->snapid = CEPH_SNAPDIR;
11330 in->mode = diri->mode;
11331 in->uid = diri->uid;
11332 in->gid = diri->gid;
11333 in->nlink = 1;
11334 in->mtime = diri->mtime;
11335 in->ctime = diri->ctime;
11336 in->btime = diri->btime;
11337 in->atime = diri->atime;
11338 in->size = diri->size;
11339 in->change_attr = diri->change_attr;
11340
11341 in->dirfragtree.clear();
11342 in->snapdir_parent = diri;
11343 diri->flags |= I_SNAPDIR_OPEN;
11344 inode_map[vino] = in;
11345 if (use_faked_inos())
11346 _assign_faked_ino(in);
11347 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11348 } else {
11349 in = inode_map[vino];
11350 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11351 }
11352 return in;
11353 }
11354
11355 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11356 Inode **out, const UserPerm& perms)
11357 {
11358 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11359 if (!mref_reader.is_state_satisfied())
11360 return -CEPHFS_ENOTCONN;
11361
11362 vinodeno_t vparent = _get_vino(parent);
11363 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11364 tout(cct) << __func__ << std::endl;
11365 tout(cct) << name << std::endl;
11366
11367 std::scoped_lock lock(client_lock);
11368
11369 int r = 0;
11370 if (!fuse_default_permissions) {
11371 if (strcmp(name, ".") && strcmp(name, "..")) {
11372 r = may_lookup(parent, perms);
11373 if (r < 0)
11374 return r;
11375 }
11376 }
11377
11378 string dname(name);
11379 InodeRef in;
11380
11381 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11382 if (r < 0) {
11383 attr->st_ino = 0;
11384 goto out;
11385 }
11386
11387 ceph_assert(in);
11388 fill_stat(in, attr);
11389 _ll_get(in.get());
11390
11391 out:
11392 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11393 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11394 tout(cct) << attr->st_ino << std::endl;
11395 *out = in.get();
11396 return r;
11397 }
11398
11399 int Client::ll_lookup_vino(
11400 vinodeno_t vino,
11401 const UserPerm& perms,
11402 Inode **inode)
11403 {
11404 ceph_assert(inode != NULL);
11405 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11406 if (!mref_reader.is_state_satisfied())
11407 return -CEPHFS_ENOTCONN;
11408
11409 std::scoped_lock lock(client_lock);
11410 ldout(cct, 3) << __func__ << " " << vino << dendl;
11411
11412 // Check the cache first
11413 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11414 if (p != inode_map.end()) {
11415 *inode = p->second;
11416 _ll_get(*inode);
11417 return 0;
11418 }
11419
11420 uint64_t snapid = vino.snapid;
11421
11422 // for snapdir, find the non-snapped dir inode
11423 if (snapid == CEPH_SNAPDIR)
11424 vino.snapid = CEPH_NOSNAP;
11425
11426 int r = _lookup_vino(vino, perms, inode);
11427 if (r)
11428 return r;
11429 ceph_assert(*inode != NULL);
11430
11431 if (snapid == CEPH_SNAPDIR) {
11432 Inode *tmp = *inode;
11433
11434 // open the snapdir and put the inode ref
11435 *inode = open_snapdir(tmp);
11436 _ll_forget(tmp, 1);
11437 _ll_get(*inode);
11438 }
11439 return 0;
11440 }
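// Example: a lookup of vinodeno_t{ino, CEPH_SNAPDIR} first resolves the live
// (CEPH_NOSNAP) inode, then swaps in the synthetic snapdir inode from
// open_snapdir(); the ll reference is moved from the live inode to the
// snapdir inode via _ll_forget()/_ll_get().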
11441
11442 int Client::ll_lookup_inode(
11443 struct inodeno_t ino,
11444 const UserPerm& perms,
11445 Inode **inode)
11446 {
11447 vinodeno_t vino(ino, CEPH_NOSNAP);
11448 return ll_lookup_vino(vino, perms, inode);
11449 }
11450
11451 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11452 struct ceph_statx *stx, unsigned want, unsigned flags,
11453 const UserPerm& perms)
11454 {
11455 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11456 if (!mref_reader.is_state_satisfied())
11457 return -CEPHFS_ENOTCONN;
11458
11459 vinodeno_t vparent = _get_vino(parent);
11460 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11461 tout(cct) << "ll_lookupx" << std::endl;
11462 tout(cct) << name << std::endl;
11463
11464 std::scoped_lock lock(client_lock);
11465
11466 int r = 0;
11467 if (!fuse_default_permissions) {
11468 r = may_lookup(parent, perms);
11469 if (r < 0)
11470 return r;
11471 }
11472
11473 string dname(name);
11474 InodeRef in;
11475
11476 unsigned mask = statx_to_mask(flags, want);
11477 r = _lookup(parent, dname, mask, &in, perms);
11478 if (r < 0) {
11479 stx->stx_ino = 0;
11480 stx->stx_mask = 0;
11481 } else {
11482 ceph_assert(in);
11483 fill_statx(in, mask, stx);
11484 _ll_get(in.get());
11485 }
11486
11487 ldout(cct, 3) << __func__ << " " << vparent << " " << name
11488 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11489 tout(cct) << stx->stx_ino << std::endl;
11490 *out = in.get();
11491 return r;
11492 }
11493
11494 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11495 unsigned int want, unsigned int flags, const UserPerm& perms)
11496 {
11497 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11498 if (!mref_reader.is_state_satisfied())
11499 return -CEPHFS_ENOTCONN;
11500
11501 filepath fp(name, 0);
11502 InodeRef in;
11503 int rc;
11504 unsigned mask = statx_to_mask(flags, want);
11505
11506 ldout(cct, 3) << __func__ << " " << name << dendl;
11507 tout(cct) << __func__ << std::endl;
11508 tout(cct) << name << std::endl;
11509
11510 std::scoped_lock lock(client_lock);
11511 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11512 if (rc < 0) {
11513 /* zero out mask, just in case... */
11514 stx->stx_mask = 0;
11515 stx->stx_ino = 0;
11516 *out = NULL;
11517 return rc;
11518 } else {
11519 ceph_assert(in);
11520 fill_statx(in, mask, stx);
11521 _ll_get(in.get());
11522 *out = in.get();
11523 return 0;
11524 }
11525 }
11526
11527 void Client::_ll_get(Inode *in)
11528 {
11529 if (in->ll_ref == 0) {
11530 in->get();
11531 if (in->is_dir() && !in->dentries.empty()) {
11532 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11533 in->get_first_parent()->get(); // pin dentry
11534 }
11535 if (in->snapid != CEPH_NOSNAP)
11536 ll_snap_ref[in->snapid]++;
11537 }
11538 in->ll_get();
11539 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
11540 }
11541
11542 int Client::_ll_put(Inode *in, uint64_t num)
11543 {
11544 in->ll_put(num);
11545 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
11546 if (in->ll_ref == 0) {
11547 if (in->is_dir() && !in->dentries.empty()) {
11548 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
11549 in->get_first_parent()->put(); // unpin dentry
11550 }
11551 if (in->snapid != CEPH_NOSNAP) {
11552 auto p = ll_snap_ref.find(in->snapid);
11553 ceph_assert(p != ll_snap_ref.end());
11554 ceph_assert(p->second > 0);
11555 if (--p->second == 0)
11556 ll_snap_ref.erase(p);
11557 }
11558 put_inode(in);
11559 return 0;
11560 } else {
11561 return in->ll_ref;
11562 }
11563 }
11564
11565 void Client::_ll_drop_pins()
11566 {
11567 ldout(cct, 10) << __func__ << dendl;
11568 std::set<InodeRef> to_be_put; // destructed item by item on scope exit, releasing the held refs
11569 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11570 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11571 it != inode_map.end();
11572 it = next) {
11573 Inode *in = it->second;
11574 next = it;
11575 ++next;
11576 if (in->ll_ref){
11577 to_be_put.insert(in);
11578 _ll_put(in, in->ll_ref);
11579 }
11580 }
11581 }
11582
11583 bool Client::_ll_forget(Inode *in, uint64_t count)
11584 {
11585 inodeno_t ino = in->ino;
11586
11587 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11588 tout(cct) << __func__ << std::endl;
11589 tout(cct) << ino.val << std::endl;
11590 tout(cct) << count << std::endl;
11591
11592 // Ignore forget if we're no longer mounted
11593 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11594 if (!mref_reader.is_state_satisfied())
11595 return true;
11596
11597 if (ino == 1) return true; // ignore forget on root.
11598
11599 bool last = false;
11600 if (in->ll_ref < count) {
11601 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11602 << ", which only has ll_ref=" << in->ll_ref << dendl;
11603 _ll_put(in, in->ll_ref);
11604 last = true;
11605 } else {
11606 if (_ll_put(in, count) == 0)
11607 last = true;
11608 }
11609
11610 return last;
11611 }
11612
11613 bool Client::ll_forget(Inode *in, uint64_t count)
11614 {
11615 std::scoped_lock lock(client_lock);
11616 return _ll_forget(in, count);
11617 }
11618
11619 bool Client::ll_put(Inode *in)
11620 {
11621 /* ll_forget already takes the lock */
11622 return ll_forget(in, 1);
11623 }
11624
11625 int Client::ll_get_snap_ref(snapid_t snap)
11626 {
11627 std::scoped_lock lock(client_lock);
11628 auto p = ll_snap_ref.find(snap);
11629 if (p != ll_snap_ref.end())
11630 return p->second;
11631 return 0;
11632 }
11633
11634 snapid_t Client::ll_get_snapid(Inode *in)
11635 {
11636 std::scoped_lock lock(client_lock);
11637 return in->snapid;
11638 }
11639
11640 Inode *Client::ll_get_inode(ino_t ino)
11641 {
11642 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11643 if (!mref_reader.is_state_satisfied())
11644 return NULL;
11645
11646 std::scoped_lock lock(client_lock);
11647
11648 vinodeno_t vino = _map_faked_ino(ino);
11649 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11650 if (p == inode_map.end())
11651 return NULL;
11652 Inode *in = p->second;
11653 _ll_get(in);
11654 return in;
11655 }
11656
11657 Inode *Client::ll_get_inode(vinodeno_t vino)
11658 {
11659 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11660 if (!mref_reader.is_state_satisfied())
11661 return NULL;
11662
11663 std::scoped_lock lock(client_lock);
11664
11665 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11666 if (p == inode_map.end())
11667 return NULL;
11668 Inode *in = p->second;
11669 _ll_get(in);
11670 return in;
11671 }
11672
11673 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
11674 {
11675 vinodeno_t vino = _get_vino(in);
11676
11677 ldout(cct, 8) << __func__ << " " << vino << dendl;
11678 tout(cct) << __func__ << std::endl;
11679 tout(cct) << vino.ino.val << std::endl;
11680
11681 if (vino.snapid < CEPH_NOSNAP)
11682 return 0;
11683 else
11684 return _getattr(in, caps, perms);
11685 }
11686
11687 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
11688 {
11689 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11690 if (!mref_reader.is_state_satisfied())
11691 return -CEPHFS_ENOTCONN;
11692
11693 std::scoped_lock lock(client_lock);
11694
11695 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
11696
11697 if (res == 0)
11698 fill_stat(in, attr);
11699 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11700 return res;
11701 }
11702
11703 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
11704 unsigned int flags, const UserPerm& perms)
11705 {
11706 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11707 if (!mref_reader.is_state_satisfied())
11708 return -CEPHFS_ENOTCONN;
11709
11710 std::scoped_lock lock(client_lock);
11711
11712 int res = 0;
11713 unsigned mask = statx_to_mask(flags, want);
11714
11715 if (mask && !in->caps_issued_mask(mask, true))
11716 res = _ll_getattr(in, mask, perms);
11717
11718 if (res == 0)
11719 fill_statx(in, mask, stx);
11720 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11721 return res;
11722 }
11723
11724 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11725 const UserPerm& perms, InodeRef *inp)
11726 {
11727 vinodeno_t vino = _get_vino(in);
11728
11729 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
11730 << dendl;
11731 tout(cct) << __func__ << std::endl;
11732 tout(cct) << vino.ino.val << std::endl;
11733 tout(cct) << stx->stx_mode << std::endl;
11734 tout(cct) << stx->stx_uid << std::endl;
11735 tout(cct) << stx->stx_gid << std::endl;
11736 tout(cct) << stx->stx_size << std::endl;
11737 tout(cct) << stx->stx_mtime << std::endl;
11738 tout(cct) << stx->stx_atime << std::endl;
11739 tout(cct) << stx->stx_btime << std::endl;
11740 tout(cct) << mask << std::endl;
11741
11742 if (!fuse_default_permissions) {
11743 int res = may_setattr(in, stx, mask, perms);
11744 if (res < 0)
11745 return res;
11746 }
11747
11748 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
11749
11750 return __setattrx(in, stx, mask, perms, inp);
11751 }
11752
11753 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11754 const UserPerm& perms)
11755 {
11756 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11757 if (!mref_reader.is_state_satisfied())
11758 return -CEPHFS_ENOTCONN;
11759
11760 std::scoped_lock lock(client_lock);
11761
11762 InodeRef target(in);
11763 int res = _ll_setattrx(in, stx, mask, perms, &target);
11764 if (res == 0) {
11765 ceph_assert(in == target.get());
11766 fill_statx(in, in->caps_issued(), stx);
11767 }
11768
11769 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11770 return res;
11771 }
11772
11773 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11774 const UserPerm& perms)
11775 {
11776 struct ceph_statx stx;
11777 stat_to_statx(attr, &stx);
11778
11779 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11780 if (!mref_reader.is_state_satisfied())
11781 return -CEPHFS_ENOTCONN;
11782
11783 std::scoped_lock lock(client_lock);
11784
11785 InodeRef target(in);
11786 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11787 if (res == 0) {
11788 ceph_assert(in == target.get());
11789 fill_stat(in, attr);
11790 }
11791
11792 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
11793 return res;
11794 }
11795
11796
11797 // ----------
11798 // xattrs
11799
11800 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11801 const UserPerm& perms)
11802 {
11803 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11804 if (!mref_reader.is_state_satisfied())
11805 return -CEPHFS_ENOTCONN;
11806
11807 std::scoped_lock lock(client_lock);
11808
11809 InodeRef in;
11810 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11811 if (r < 0)
11812 return r;
11813 return _getxattr(in, name, value, size, perms);
11814 }
11815
11816 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11817 const UserPerm& perms)
11818 {
11819 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11820 if (!mref_reader.is_state_satisfied())
11821 return -CEPHFS_ENOTCONN;
11822
11823 std::scoped_lock lock(client_lock);
11824
11825 InodeRef in;
11826 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11827 if (r < 0)
11828 return r;
11829 return _getxattr(in, name, value, size, perms);
11830 }
11831
11832 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11833 const UserPerm& perms)
11834 {
11835 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11836 if (!mref_reader.is_state_satisfied())
11837 return -CEPHFS_ENOTCONN;
11838
11839 std::scoped_lock lock(client_lock);
11840
11841 Fh *f = get_filehandle(fd);
11842 if (!f)
11843 return -CEPHFS_EBADF;
11844 return _getxattr(f->inode, name, value, size, perms);
11845 }
11846
11847 int Client::listxattr(const char *path, char *list, size_t size,
11848 const UserPerm& perms)
11849 {
11850 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11851 if (!mref_reader.is_state_satisfied())
11852 return -CEPHFS_ENOTCONN;
11853
11854 std::scoped_lock lock(client_lock);
11855
11856 InodeRef in;
11857 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11858 if (r < 0)
11859 return r;
11860 return Client::_listxattr(in.get(), list, size, perms);
11861 }
11862
11863 int Client::llistxattr(const char *path, char *list, size_t size,
11864 const UserPerm& perms)
11865 {
11866 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11867 if (!mref_reader.is_state_satisfied())
11868 return -CEPHFS_ENOTCONN;
11869
11870 std::scoped_lock lock(client_lock);
11871
11872 InodeRef in;
11873 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11874 if (r < 0)
11875 return r;
11876 return Client::_listxattr(in.get(), list, size, perms);
11877 }
11878
11879 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11880 {
11881 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11882 if (!mref_reader.is_state_satisfied())
11883 return -CEPHFS_ENOTCONN;
11884
11885 std::scoped_lock lock(client_lock);
11886
11887 Fh *f = get_filehandle(fd);
11888 if (!f)
11889 return -CEPHFS_EBADF;
11890 return Client::_listxattr(f->inode.get(), list, size, perms);
11891 }
11892
11893 int Client::removexattr(const char *path, const char *name,
11894 const UserPerm& perms)
11895 {
11896 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11897 if (!mref_reader.is_state_satisfied())
11898 return -CEPHFS_ENOTCONN;
11899
11900 std::scoped_lock lock(client_lock);
11901
11902 InodeRef in;
11903 int r = Client::path_walk(path, &in, perms, true);
11904 if (r < 0)
11905 return r;
11906 return _removexattr(in, name, perms);
11907 }
11908
11909 int Client::lremovexattr(const char *path, const char *name,
11910 const UserPerm& perms)
11911 {
11912 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11913 if (!mref_reader.is_state_satisfied())
11914 return -CEPHFS_ENOTCONN;
11915
11916 std::scoped_lock lock(client_lock);
11917
11918 InodeRef in;
11919 int r = Client::path_walk(path, &in, perms, false);
11920 if (r < 0)
11921 return r;
11922 return _removexattr(in, name, perms);
11923 }
11924
11925 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11926 {
11927 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11928 if (!mref_reader.is_state_satisfied())
11929 return -CEPHFS_ENOTCONN;
11930
11931 std::scoped_lock lock(client_lock);
11932
11933 Fh *f = get_filehandle(fd);
11934 if (!f)
11935 return -CEPHFS_EBADF;
11936 return _removexattr(f->inode, name, perms);
11937 }
11938
11939 int Client::setxattr(const char *path, const char *name, const void *value,
11940 size_t size, int flags, const UserPerm& perms)
11941 {
11942 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11943 if (!mref_reader.is_state_satisfied())
11944 return -CEPHFS_ENOTCONN;
11945
11946 _setxattr_maybe_wait_for_osdmap(name, value, size);
11947
11948 std::scoped_lock lock(client_lock);
11949
11950 InodeRef in;
11951 int r = Client::path_walk(path, &in, perms, true);
11952 if (r < 0)
11953 return r;
11954 return _setxattr(in, name, value, size, flags, perms);
11955 }
11956
11957 int Client::lsetxattr(const char *path, const char *name, const void *value,
11958 size_t size, int flags, const UserPerm& perms)
11959 {
11960 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11961 if (!mref_reader.is_state_satisfied())
11962 return -CEPHFS_ENOTCONN;
11963
11964 _setxattr_maybe_wait_for_osdmap(name, value, size);
11965
11966 std::scoped_lock lock(client_lock);
11967
11968 InodeRef in;
11969 int r = Client::path_walk(path, &in, perms, false);
11970 if (r < 0)
11971 return r;
11972 return _setxattr(in, name, value, size, flags, perms);
11973 }
11974
11975 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11976 int flags, const UserPerm& perms)
11977 {
11978 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11979 if (!mref_reader.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN;
11981
11982 _setxattr_maybe_wait_for_osdmap(name, value, size);
11983
11984 std::scoped_lock lock(client_lock);
11985
11986 Fh *f = get_filehandle(fd);
11987 if (!f)
11988 return -CEPHFS_EBADF;
11989 return _setxattr(f->inode, name, value, size, flags, perms);
11990 }
11991
11992 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11993 const UserPerm& perms)
11994 {
11995 int r;
11996
11997 const VXattr *vxattr = _match_vxattr(in, name);
11998 if (vxattr) {
11999 r = -CEPHFS_ENODATA;
12000
12001 // Do a forced getattr to get the latest values (e.g. quota, rstat)
12002 // before returning a value to userspace.
12003 int flags = 0;
12004 if (vxattr->flags & VXATTR_RSTAT) {
12005 flags |= CEPH_STAT_RSTAT;
12006 }
12007 if (vxattr->flags & VXATTR_DIRSTAT) {
12008 flags |= CEPH_CAP_FILE_SHARED;
12009 }
12010 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
12011 if (r != 0) {
12012 // Error from getattr!
12013 return r;
12014 }
12015
12016 // invoke the vxattr's getxattr_cb (a pointer-to-member) unless its exists_cb says the attribute is absent
12017 char buf[256];
12018 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12019 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12020 } else {
12021 r = -CEPHFS_ENODATA;
12022 }
12023
12024 if (size != 0) {
12025 if (r > (int)size) {
12026 r = -CEPHFS_ERANGE;
12027 } else if (r > 0) {
12028 memcpy(value, buf, r);
12029 }
12030 }
12031 goto out;
12032 }
12033
12034 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
12035 r = -CEPHFS_EOPNOTSUPP;
12036 goto out;
12037 }
12038
12039 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12040 if (r == 0) {
12041 string n(name);
12042 r = -CEPHFS_ENODATA;
12043 if (in->xattrs.count(n)) {
12044 r = in->xattrs[n].length();
12045 if (r > 0 && size != 0) {
12046 if (size >= (unsigned)r)
12047 memcpy(value, in->xattrs[n].c_str(), r);
12048 else
12049 r = -CEPHFS_ERANGE;
12050 }
12051 }
12052 }
12053 out:
12054 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
12055 return r;
12056 }
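// As with getxattr(2), size == 0 is a length probe: the value's length is
// returned without copying, so callers can size a buffer first. Sketch
// (hypothetical caller):
//
//   int len = client->getxattr(path, "user.foo", nullptr, 0, perms);
//   if (len >= 0) {
//     std::vector<char> buf(len);
//     len = client->getxattr(path, "user.foo", buf.data(), buf.size(), perms);
//   }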
12057
12058 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12059 const UserPerm& perms)
12060 {
12061 if (cct->_conf->client_permissions) {
12062 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12063 if (r < 0)
12064 return r;
12065 }
12066 return _getxattr(in.get(), name, value, size, perms);
12067 }
12068
12069 int Client::ll_getxattr(Inode *in, const char *name, void *value,
12070 size_t size, const UserPerm& perms)
12071 {
12072 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12073 if (!mref_reader.is_state_satisfied())
12074 return -CEPHFS_ENOTCONN;
12075
12076 vinodeno_t vino = _get_vino(in);
12077
12078 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12079 tout(cct) << __func__ << std::endl;
12080 tout(cct) << vino.ino.val << std::endl;
12081 tout(cct) << name << std::endl;
12082
12083 std::scoped_lock lock(client_lock);
12084 if (!fuse_default_permissions) {
12085 int r = xattr_permission(in, name, MAY_READ, perms);
12086 if (r < 0)
12087 return r;
12088 }
12089
12090 return _getxattr(in, name, value, size, perms);
12091 }
12092
12093 int Client::_listxattr(Inode *in, char *name, size_t size,
12094 const UserPerm& perms)
12095 {
12096 bool len_only = (size == 0);
12097 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12098 if (r != 0) {
12099 goto out;
12100 }
12101
12102 r = 0;
12103 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12104 if (xattr_name.rfind("ceph.", 0) == 0) {
12105 continue;
12106 }
12107
12108 size_t this_len = xattr_name.length() + 1;
12109 r += this_len;
12110 if (len_only)
12111 continue;
12112
12113 if (this_len > size) {
12114 r = -CEPHFS_ERANGE;
12115 goto out;
12116 }
12117
12118 memcpy(name, xattr_name.c_str(), this_len);
12119 name += this_len;
12120 size -= this_len;
12121 }
12122 out:
12123 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
12124 return r;
12125 }
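// The same two-call pattern applies: size == 0 (len_only) returns the total
// length of the NUL-separated name list, with "ceph." virtual xattrs
// filtered out above. Sketch (hypothetical caller):
//
//   int len = client->listxattr(path, nullptr, 0, perms);
//   if (len > 0) {
//     std::vector<char> names(len);
//     len = client->listxattr(path, names.data(), names.size(), perms);
//   }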
12126
12127 int Client::ll_listxattr(Inode *in, char *names, size_t size,
12128 const UserPerm& perms)
12129 {
12130 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12131 if (!mref_reader.is_state_satisfied())
12132 return -CEPHFS_ENOTCONN;
12133
12134 vinodeno_t vino = _get_vino(in);
12135
12136 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12137 tout(cct) << __func__ << std::endl;
12138 tout(cct) << vino.ino.val << std::endl;
12139 tout(cct) << size << std::endl;
12140
12141 std::scoped_lock lock(client_lock);
12142 return _listxattr(in, names, size, perms);
12143 }
12144
12145 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
12146 size_t size, int flags, const UserPerm& perms)
12147 {
12148
12149 int xattr_flags = 0;
12150 if (!value)
12151 xattr_flags |= CEPH_XATTR_REMOVE;
12152 if (flags & XATTR_CREATE)
12153 xattr_flags |= CEPH_XATTR_CREATE;
12154 if (flags & XATTR_REPLACE)
12155 xattr_flags |= CEPH_XATTR_REPLACE;
12156
12157 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
12158 filepath path;
12159 in->make_nosnap_relative_path(path);
12160 req->set_filepath(path);
12161 req->set_string2(name);
12162 req->set_inode(in);
12163 req->head.args.setxattr.flags = xattr_flags;
12164
12165 bufferlist bl;
12166 ceph_assert(value || size == 0);
12167 bl.append((const char*)value, size);
12168 req->set_data(bl);
12169
12170 int res = make_request(req, perms);
12171
12172 trim_cache();
12173 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
12174 res << dendl;
12175 return res;
12176 }
12177
12178 int Client::_setxattr(Inode *in, const char *name, const void *value,
12179 size_t size, int flags, const UserPerm& perms)
12180 {
12181 if (in->snapid != CEPH_NOSNAP) {
12182 return -CEPHFS_EROFS;
12183 }
12184
12185 if (size == 0) {
12186 value = "";
12187 } else if (value == NULL) {
12188 return -CEPHFS_EINVAL;
12189 }
12190
12191 bool posix_acl_xattr = false;
12192 if (acl_type == POSIX_ACL)
12193 posix_acl_xattr = !strncmp(name, "system.", 7);
12194
12195 if (strncmp(name, "user.", 5) &&
12196 strncmp(name, "security.", 9) &&
12197 strncmp(name, "trusted.", 8) &&
12198 strncmp(name, "ceph.", 5) &&
12199 !posix_acl_xattr)
12200 return -CEPHFS_EOPNOTSUPP;
12201
12202 bool check_realm = false;
12203
12204 if (posix_acl_xattr) {
12205 if (!strcmp(name, ACL_EA_ACCESS)) {
12206 mode_t new_mode = in->mode;
12207 if (value) {
12208 int ret = posix_acl_equiv_mode(value, size, &new_mode);
12209 if (ret < 0)
12210 return ret;
12211 if (ret == 0) {
12212 value = NULL;
12213 size = 0;
12214 }
12215 if (new_mode != in->mode) {
12216 struct ceph_statx stx;
12217 stx.stx_mode = new_mode;
12218 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
12219 if (ret < 0)
12220 return ret;
12221 }
12222 }
12223 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
12224 if (value) {
12225 if (!S_ISDIR(in->mode))
12226 return -CEPHFS_EACCES;
12227 int ret = posix_acl_check(value, size);
12228 if (ret < 0)
12229 return -CEPHFS_EINVAL;
12230 if (ret == 0) {
12231 value = NULL;
12232 size = 0;
12233 }
12234 }
12235 } else {
12236 return -CEPHFS_EOPNOTSUPP;
12237 }
12238 } else {
12239 const VXattr *vxattr = _match_vxattr(in, name);
12240 if (vxattr) {
12241 if (vxattr->readonly)
12242 return -CEPHFS_EOPNOTSUPP;
12243 if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
12244 check_realm = true;
12245 }
12246 }
12247
12248 int ret = _do_setxattr(in, name, value, size, flags, perms);
12249 if (ret >= 0 && check_realm) {
12250 // check if snaprealm was created for quota inode
12251 if (in->quota.is_enable() &&
12252 !(in->snaprealm && in->snaprealm->ino == in->ino))
12253 ret = -CEPHFS_EOPNOTSUPP;
12254 }
12255
12256 return ret;
12257 }
12258
12259 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12260 size_t size, int flags, const UserPerm& perms)
12261 {
12262 if (cct->_conf->client_permissions) {
12263 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12264 if (r < 0)
12265 return r;
12266 }
12267 return _setxattr(in.get(), name, value, size, flags, perms);
12268 }
12269
12270 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
12271 {
12272 string tmp;
12273 if (name == "layout") {
12274 string::iterator begin = value.begin();
12275 string::iterator end = value.end();
12276 keys_and_values<string::iterator> p; // create instance of parser
12277 std::map<string, string> m; // map to receive results
12278 if (!qi::parse(begin, end, p, m)) { // returns true if successful
12279 return -CEPHFS_EINVAL;
12280 }
12281 if (begin != end)
12282 return -CEPHFS_EINVAL;
12283 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
12284 if (q->first == "pool") {
12285 tmp = q->second;
12286 break;
12287 }
12288 }
12289 } else if (name == "layout.pool") {
12290 tmp = value;
12291 }
12292
12293 if (tmp.length()) {
12294 int64_t pool;
12295 try {
12296 pool = boost::lexical_cast<unsigned>(tmp);
12297 if (!osdmap->have_pg_pool(pool))
12298 return -CEPHFS_ENOENT;
12299 } catch (boost::bad_lexical_cast const&) {
12300 pool = osdmap->lookup_pg_pool_name(tmp);
12301 if (pool < 0) {
12302 return -CEPHFS_ENOENT;
12303 }
12304 }
12305 }
12306
12307 return 0;
12308 }
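// The "layout" value parsed above is a space-separated key=value string in
// the same form _vxattrcb_layout() emits, e.g.
//
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
//
// Only the pool is validated against the osdmap here; it may be given by
// name or by numeric id ("layout.pool" passes the bare value directly).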
12309
12310 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
12311 {
12312 // Setting the pool in a layout requires an osdmap epoch in the MetaRequest.
12313 // There is a race where a newly created data pool is not yet known to either the client or the MDS.
12314 // Fetch the latest osdmap on the client so the MDS can quickly judge whether it needs a newer one.
12315 ldout(cct, 15) << __func__ << ": name = " << name << dendl;
12316 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
12317 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
12318 string rest(strstr(name, "layout"));
12319 string v((const char*)value, size);
12320 int r = objecter->with_osdmap([&](const OSDMap& o) {
12321 return _setxattr_check_data_pool(rest, v, &o);
12322 });
12323
12324 if (r == -CEPHFS_ENOENT) {
12325 bs::error_code ec;
12326 ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
12327 objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
12328 ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
12329 }
12330 }
12331 }
12332
12333 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12334 size_t size, int flags, const UserPerm& perms)
12335 {
12336 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12337 if (!mref_reader.is_state_satisfied())
12338 return -CEPHFS_ENOTCONN;
12339
12340 _setxattr_maybe_wait_for_osdmap(name, value, size);
12341
12342 vinodeno_t vino = _get_vino(in);
12343
12344 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12345 tout(cct) << __func__ << std::endl;
12346 tout(cct) << vino.ino.val << std::endl;
12347 tout(cct) << name << std::endl;
12348
12349 std::scoped_lock lock(client_lock);
12350 if (!fuse_default_permissions) {
12351 int r = xattr_permission(in, name, MAY_WRITE, perms);
12352 if (r < 0)
12353 return r;
12354 }
12355 return _setxattr(in, name, value, size, flags, perms);
12356 }
12357
12358 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
12359 {
12360 if (in->snapid != CEPH_NOSNAP) {
12361 return -CEPHFS_EROFS;
12362 }
12363
12364 // same xattrs supported by kernel client
12365 if (strncmp(name, "user.", 5) &&
12366 strncmp(name, "system.", 7) &&
12367 strncmp(name, "security.", 9) &&
12368 strncmp(name, "trusted.", 8) &&
12369 strncmp(name, "ceph.", 5))
12370 return -CEPHFS_EOPNOTSUPP;
12371
12372 const VXattr *vxattr = _match_vxattr(in, name);
12373 if (vxattr && vxattr->readonly)
12374 return -CEPHFS_EOPNOTSUPP;
12375
12376 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
12377 filepath path;
12378 in->make_nosnap_relative_path(path);
12379 req->set_filepath(path);
12380 req->set_filepath2(name);
12381 req->set_inode(in);
12382
12383 int res = make_request(req, perms);
12384
12385 trim_cache();
12386 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
12387 return res;
12388 }
12389
12390 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12391 {
12392 if (cct->_conf->client_permissions) {
12393 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12394 if (r < 0)
12395 return r;
12396 }
12397 return _removexattr(in.get(), name, perms);
12398 }
12399
12400 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12401 {
12402 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12403 if (!mref_reader.is_state_satisfied())
12404 return -CEPHFS_ENOTCONN;
12405
12406 vinodeno_t vino = _get_vino(in);
12407
12408 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12409 tout(cct) << "ll_removexattr" << std::endl;
12410 tout(cct) << vino.ino.val << std::endl;
12411 tout(cct) << name << std::endl;
12412
12413 std::scoped_lock lock(client_lock);
12414 if (!fuse_default_permissions) {
12415 int r = xattr_permission(in, name, MAY_WRITE, perms);
12416 if (r < 0)
12417 return r;
12418 }
12419
12420 return _removexattr(in, name, perms);
12421 }
12422
12423 bool Client::_vxattrcb_quota_exists(Inode *in)
12424 {
12425 return in->quota.is_enable() &&
12426 (in->snapid != CEPH_NOSNAP ||
12427 (in->snaprealm && in->snaprealm->ino == in->ino));
12428 }
12429 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
12430 {
12431 return snprintf(val, size,
12432 "max_bytes=%lld max_files=%lld",
12433 (long long int)in->quota.max_bytes,
12434 (long long int)in->quota.max_files);
12435 }
12436 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
12437 {
12438 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
12439 }
12440 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
12441 {
12442 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
12443 }
12444
12445 bool Client::_vxattrcb_layout_exists(Inode *in)
12446 {
12447 return in->layout != file_layout_t();
12448 }
12449 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12450 {
12451 int r = snprintf(val, size,
12452 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12453 (unsigned long long)in->layout.stripe_unit,
12454 (unsigned long long)in->layout.stripe_count,
12455 (unsigned long long)in->layout.object_size);
12456 objecter->with_osdmap([&](const OSDMap& o) {
12457 if (o.have_pg_pool(in->layout.pool_id))
12458 r += snprintf(val + r, size - r, "%s",
12459 o.get_pool_name(in->layout.pool_id).c_str());
12460 else
12461 r += snprintf(val + r, size - r, "%" PRIu64,
12462 (uint64_t)in->layout.pool_id);
12463 });
12464 if (in->layout.pool_ns.length())
12465 r += snprintf(val + r, size - r, " pool_namespace=%s",
12466 in->layout.pool_ns.c_str());
12467 return r;
12468 }
12469 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
12470 {
12471 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
12472 }
12473 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
12474 {
12475 return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
12476 }
12477 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
12478 {
12479 return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
12480 }
12481 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
12482 {
12483 size_t r;
12484 objecter->with_osdmap([&](const OSDMap& o) {
12485 if (o.have_pg_pool(in->layout.pool_id))
12486 r = snprintf(val, size, "%s", o.get_pool_name(
12487 in->layout.pool_id).c_str());
12488 else
12489 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
12490 });
12491 return r;
12492 }
12493 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
12494 {
12495 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
12496 }
12497 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
12498 {
12499 return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
12500 }
12501 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
12502 {
12503 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
12504 }
12505 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
12506 {
12507 return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
12508 }
12509 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
12510 {
12511 return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
12512 }
12513 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
12514 {
12515 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
12516 }
12517 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
12518 {
12519 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
12520 }
12521 size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
12522 {
12523 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
12524 }
12525 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
12526 {
12527 return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
12528 }
12529 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
12530 {
12531 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
12532 (long)in->rstat.rctime.nsec());
12533 }
12534 bool Client::_vxattrcb_dir_pin_exists(Inode *in)
12535 {
12536 return in->dir_pin != -CEPHFS_ENODATA;
12537 }
12538 size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
12539 {
12540 return snprintf(val, size, "%ld", (long)in->dir_pin);
12541 }
12542
12543 bool Client::_vxattrcb_snap_btime_exists(Inode *in)
12544 {
12545 return !in->snap_btime.is_zero();
12546 }
12547
12548 size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
12549 {
12550 return snprintf(val, size, "%llu.%09lu",
12551 (long long unsigned)in->snap_btime.sec(),
12552 (long unsigned)in->snap_btime.nsec());
12553 }
12554
12555 bool Client::_vxattrcb_mirror_info_exists(Inode *in)
12556 {
12557 // checking one of the xattrs would suffice
12558 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
12559 }
12560
12561 size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
12562 {
12563 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
12564 in->xattrs["ceph.mirror.info.cluster_id"].length(),
12565 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
12566 in->xattrs["ceph.mirror.info.fs_id"].length(),
12567 in->xattrs["ceph.mirror.info.fs_id"].c_str());
12568 }
12569
12570 size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
12571 {
12572 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
12573 }
12574
12575 size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
12576 {
12577 auto name = messenger->get_myname();
12578 return snprintf(val, size, "%s%ld", name.type_str(), name.num());
12579 }
12580
12581 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12582 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12583
12584 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12585 { \
12586 name: CEPH_XATTR_NAME(_type, _name), \
12587 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12588 readonly: true, \
12589 exists_cb: NULL, \
12590 flags: _flags, \
12591 }
12592 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12593 { \
12594 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12595 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12596 readonly: false, \
12597 exists_cb: &Client::_vxattrcb_layout_exists, \
12598 flags: 0, \
12599 }
12600 #define XATTR_QUOTA_FIELD(_type, _name) \
12601 { \
12602 name: CEPH_XATTR_NAME(_type, _name), \
12603 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12604 readonly: false, \
12605 exists_cb: &Client::_vxattrcb_quota_exists, \
12606 flags: 0, \
12607 }
12608
12609 const Client::VXattr Client::_dir_vxattrs[] = {
12610 {
12611 name: "ceph.dir.layout",
12612 getxattr_cb: &Client::_vxattrcb_layout,
12613 readonly: false,
12614 exists_cb: &Client::_vxattrcb_layout_exists,
12615 flags: 0,
12616 },
12617 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
12618 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
12619 XATTR_LAYOUT_FIELD(dir, layout, object_size),
12620 XATTR_LAYOUT_FIELD(dir, layout, pool),
12621 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
12622 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
12623 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
12624 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
12625 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
12626 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
12627 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
12628 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
12629 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
12630 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
12631 {
12632 name: "ceph.quota",
12633 getxattr_cb: &Client::_vxattrcb_quota,
12634 readonly: false,
12635 exists_cb: &Client::_vxattrcb_quota_exists,
12636 flags: 0,
12637 },
12638 XATTR_QUOTA_FIELD(quota, max_bytes),
12639 XATTR_QUOTA_FIELD(quota, max_files),
12640 {
12641 name: "ceph.dir.pin",
12642 getxattr_cb: &Client::_vxattrcb_dir_pin,
12643 readonly: false,
12644 exists_cb: &Client::_vxattrcb_dir_pin_exists,
12645 flags: 0,
12646 },
12647 {
12648 name: "ceph.snap.btime",
12649 getxattr_cb: &Client::_vxattrcb_snap_btime,
12650 readonly: true,
12651 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12652 flags: 0,
12653 },
12654 {
12655 name: "ceph.mirror.info",
12656 getxattr_cb: &Client::_vxattrcb_mirror_info,
12657 readonly: false,
12658 exists_cb: &Client::_vxattrcb_mirror_info_exists,
12659 flags: 0,
12660 },
12661 { name: "" } /* Required table terminator */
12662 };
12663
12664 const Client::VXattr Client::_file_vxattrs[] = {
12665 {
12666 name: "ceph.file.layout",
12667 getxattr_cb: &Client::_vxattrcb_layout,
12668 readonly: false,
12669 exists_cb: &Client::_vxattrcb_layout_exists,
12670 flags: 0,
12671 },
12672 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
12673 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
12674 XATTR_LAYOUT_FIELD(file, layout, object_size),
12675 XATTR_LAYOUT_FIELD(file, layout, pool),
12676 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
12677 {
12678 name: "ceph.snap.btime",
12679 getxattr_cb: &Client::_vxattrcb_snap_btime,
12680 readonly: true,
12681 exists_cb: &Client::_vxattrcb_snap_btime_exists,
12682 flags: 0,
12683 },
12684 { name: "" } /* Required table terminator */
12685 };
12686
12687 const Client::VXattr Client::_common_vxattrs[] = {
12688 {
12689 name: "ceph.cluster_fsid",
12690 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
12691 readonly: true,
12692 exists_cb: nullptr,
12693 flags: 0,
12694 },
12695 {
12696 name: "ceph.client_id",
12697 getxattr_cb: &Client::_vxattrcb_client_id,
12698 readonly: true,
12699 exists_cb: nullptr,
12700 flags: 0,
12701 },
12702 { name: "" } /* Required table terminator */
12703 };
12704
12705 const Client::VXattr *Client::_get_vxattrs(Inode *in)
12706 {
12707 if (in->is_dir())
12708 return _dir_vxattrs;
12709 else if (in->is_file())
12710 return _file_vxattrs;
12711 return NULL;
12712 }
12713
12714 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
12715 {
12716 if (strncmp(name, "ceph.", 5) == 0) {
12717 const VXattr *vxattr = _get_vxattrs(in);
12718 if (vxattr) {
12719 while (!vxattr->name.empty()) {
12720 if (vxattr->name == name)
12721 return vxattr;
12722 vxattr++;
12723 }
12724 }
12725
12726 // for common vxattrs
12727 vxattr = _common_vxattrs;
12728 while (!vxattr->name.empty()) {
12729 if (vxattr->name == name)
12730 return vxattr;
12731 vxattr++;
12732 }
12733 }
12734
12735 return NULL;
12736 }
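// Example: for a directory inode, _match_vxattr(in, "ceph.dir.rbytes")
// returns the XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT) entry from
// _dir_vxattrs, whose getxattr_cb formats in->rstat.rbytes; a name like
// "ceph.cluster_fsid" falls through to the _common_vxattrs table.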
12737
12738 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
12739 {
12740 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12741 if (!mref_reader.is_state_satisfied())
12742 return -CEPHFS_ENOTCONN;
12743
12744 vinodeno_t vino = _get_vino(in);
12745
12746 ldout(cct, 3) << "ll_readlink " << vino << dendl;
12747 tout(cct) << "ll_readlink" << std::endl;
12748 tout(cct) << vino.ino.val << std::endl;
12749
12750 std::scoped_lock lock(client_lock);
12751 for (auto dn : in->dentries) {
12752 touch_dn(dn);
12753 }
12754
12755 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
12756 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
12757 return r;
12758 }
12759
12760 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
12761 const UserPerm& perms, InodeRef *inp)
12762 {
12763 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
12764 << mode << dec << ", " << rdev << ", uid " << perms.uid()
12765 << ", gid " << perms.gid() << ")" << dendl;
12766
12767 if (strlen(name) > NAME_MAX)
12768 return -CEPHFS_ENAMETOOLONG;
12769
12770 if (dir->snapid != CEPH_NOSNAP) {
12771 return -CEPHFS_EROFS;
12772 }
12773 if (is_quota_files_exceeded(dir, perms)) {
12774 return -CEPHFS_EDQUOT;
12775 }
12776
12777 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
12778
12779 filepath path;
12780 dir->make_nosnap_relative_path(path);
12781 path.push_dentry(name);
12782 req->set_filepath(path);
12783 req->set_inode(dir);
12784 req->head.args.mknod.rdev = rdev;
12785 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12786 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12787
12788 bufferlist xattrs_bl;
12789 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12790 if (res < 0)
12791 goto fail;
12792 req->head.args.mknod.mode = mode;
12793 if (xattrs_bl.length() > 0)
12794 req->set_data(xattrs_bl);
12795
12796 Dentry *de;
12797 res = get_or_create(dir, name, &de);
12798 if (res < 0)
12799 goto fail;
12800 req->set_dentry(de);
12801
12802 res = make_request(req, perms, inp);
12803
12804 trim_cache();
12805
12806 ldout(cct, 8) << "_mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
12807 return res;
12808
12809 fail:
12810 put_request(req);
12811 return res;
12812 }
12813
12814 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12815 dev_t rdev, struct stat *attr, Inode **out,
12816 const UserPerm& perms)
12817 {
12818 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12819 if (!mref_reader.is_state_satisfied())
12820 return -CEPHFS_ENOTCONN;
12821
12822 vinodeno_t vparent = _get_vino(parent);
12823
12824 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12825 tout(cct) << "ll_mknod" << std::endl;
12826 tout(cct) << vparent.ino.val << std::endl;
12827 tout(cct) << name << std::endl;
12828 tout(cct) << mode << std::endl;
12829 tout(cct) << rdev << std::endl;
12830
12831 std::scoped_lock lock(client_lock);
12832 if (!fuse_default_permissions) {
12833 int r = may_create(parent, perms);
12834 if (r < 0)
12835 return r;
12836 }
12837
12838 InodeRef in;
12839 int r = _mknod(parent, name, mode, rdev, perms, &in);
12840 if (r == 0) {
12841 fill_stat(in, attr);
12842 _ll_get(in.get());
12843 }
12844 tout(cct) << attr->st_ino << std::endl;
12845 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12846 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12847 *out = in.get();
12848 return r;
12849 }
12850
12851 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12852 dev_t rdev, Inode **out,
12853 struct ceph_statx *stx, unsigned want, unsigned flags,
12854 const UserPerm& perms)
12855 {
12856 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12857 if (!mref_reader.is_state_satisfied())
12858 return -CEPHFS_ENOTCONN;
12859
12860 unsigned caps = statx_to_mask(flags, want);
12861
12862 vinodeno_t vparent = _get_vino(parent);
12863
12864 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12865 tout(cct) << "ll_mknodx" << std::endl;
12866 tout(cct) << vparent.ino.val << std::endl;
12867 tout(cct) << name << std::endl;
12868 tout(cct) << mode << std::endl;
12869 tout(cct) << rdev << std::endl;
12870
12871 std::scoped_lock lock(client_lock);
12872
12873 if (!fuse_default_permissions) {
12874 int r = may_create(parent, perms);
12875 if (r < 0)
12876 return r;
12877 }
12878
12879 InodeRef in;
12880 int r = _mknod(parent, name, mode, rdev, perms, &in);
12881 if (r == 0) {
12882 fill_statx(in, caps, stx);
12883 _ll_get(in.get());
12884 }
12885 tout(cct) << stx->stx_ino << std::endl;
12886 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12887 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12888 *out = in.get();
12889 return r;
12890 }
12891
12892 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
12893 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
12894 int object_size, const char *data_pool, bool *created,
12895 const UserPerm& perms, std::string alternate_name)
12896 {
12897 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
12898 mode << dec << ")" << dendl;
12899
12900 if (strlen(name) > NAME_MAX)
12901 return -CEPHFS_ENAMETOOLONG;
12902 if (dir->snapid != CEPH_NOSNAP) {
12903 return -CEPHFS_EROFS;
12904 }
12905 if (is_quota_files_exceeded(dir, perms)) {
12906 return -CEPHFS_EDQUOT;
12907 }
12908
12909 // use normalized flags to generate cmode
12910 int cflags = ceph_flags_sys2wire(flags);
12911 if (cct->_conf.get_val<bool>("client_force_lazyio"))
12912 cflags |= CEPH_O_LAZY;
12913
12914 int cmode = ceph_flags_to_mode(cflags);
12915
12916 int64_t pool_id = -1;
12917 if (data_pool && *data_pool) {
12918 pool_id = objecter->with_osdmap(
12919 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
12920 if (pool_id < 0)
12921 return -CEPHFS_EINVAL;
12922 if (pool_id > 0xffffffffll)
12923 return -CEPHFS_ERANGE; // bummer!
12924 }
12925
12926 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
12927
12928 filepath path;
12929 dir->make_nosnap_relative_path(path);
12930 path.push_dentry(name);
12931 req->set_filepath(path);
12932 req->set_alternate_name(std::move(alternate_name));
12933 req->set_inode(dir);
12934 req->head.args.open.flags = cflags | CEPH_O_CREAT;
12935
12936 req->head.args.open.stripe_unit = stripe_unit;
12937 req->head.args.open.stripe_count = stripe_count;
12938 req->head.args.open.object_size = object_size;
12939 if (cct->_conf->client_debug_getattr_caps)
12940 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
12941 else
12942 req->head.args.open.mask = 0;
12943 req->head.args.open.pool = pool_id;
12944 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12945 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12946
12947 mode |= S_IFREG;
12948 bufferlist xattrs_bl;
12949 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
12950 if (res < 0)
12951 goto fail;
12952 req->head.args.open.mode = mode;
12953 if (xattrs_bl.length() > 0)
12954 req->set_data(xattrs_bl);
12955
12956 Dentry *de;
12957 res = get_or_create(dir, name, &de);
12958 if (res < 0)
12959 goto fail;
12960 req->set_dentry(de);
12961
12962 res = make_request(req, perms, inp, created);
12963 if (res < 0) {
12964 goto reply_error;
12965 }
12966
12967 /* If the caller passed a value in fhp, do the open */
  if (fhp) {
12969 (*inp)->get_open_ref(cmode);
12970 *fhp = _create_fh(inp->get(), flags, cmode, perms);
12971 }
12972
12973 reply_error:
12974 trim_cache();
12975
12976 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
12977 << " layout " << stripe_unit
12978 << ' ' << stripe_count
12979 << ' ' << object_size
12980 <<") = " << res << dendl;
12981 return res;
12982
12983 fail:
12984 put_request(req);
12985 return res;
12986 }
12987
12988 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
12989 InodeRef *inp, const std::map<std::string, std::string> &metadata,
12990 std::string alternate_name)
12991 {
12992 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
12993 << mode << dec << ", uid " << perm.uid()
12994 << ", gid " << perm.gid() << ")" << dendl;
12995
12996 if (strlen(name) > NAME_MAX)
12997 return -CEPHFS_ENAMETOOLONG;
12998
12999 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13000 return -CEPHFS_EROFS;
13001 }
13002 if (is_quota_files_exceeded(dir, perm)) {
13003 return -CEPHFS_EDQUOT;
13004 }
13005
13006 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13007 MetaRequest *req = new MetaRequest(is_snap_op ?
13008 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13009
13010 filepath path;
13011 dir->make_nosnap_relative_path(path);
13012 path.push_dentry(name);
13013 req->set_filepath(path);
13014 req->set_inode(dir);
13015 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13016 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13017 req->set_alternate_name(std::move(alternate_name));
13018
13019 mode |= S_IFDIR;
13020 bufferlist bl;
13021 int res = _posix_acl_create(dir, &mode, bl, perm);
13022 if (res < 0)
13023 goto fail;
13024 req->head.args.mkdir.mode = mode;
13025 if (is_snap_op) {
13026 SnapPayload payload;
13027 // clear the bufferlist that may have been populated by the call
13028 // to _posix_acl_create(). MDS mksnap does not make use of it.
13029 // So, reuse it to pass metadata payload.
13030 bl.clear();
13031 payload.metadata = metadata;
13032 encode(payload, bl);
13033 }
13034 if (bl.length() > 0) {
13035 req->set_data(bl);
13036 }
13037
13038 Dentry *de;
13039 res = get_or_create(dir, name, &de);
13040 if (res < 0)
13041 goto fail;
13042 req->set_dentry(de);
13043
13044 ldout(cct, 10) << "_mkdir: making request" << dendl;
13045 res = make_request(req, perm, inp);
13046 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13047
13048 trim_cache();
13049
13050 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
13051 return res;
13052
13053 fail:
13054 put_request(req);
13055 return res;
13056 }
13057
13058 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13059 struct stat *attr, Inode **out, const UserPerm& perm)
13060 {
13061 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13062 if (!mref_reader.is_state_satisfied())
13063 return -CEPHFS_ENOTCONN;
13064
13065 vinodeno_t vparent = _get_vino(parent);
13066
13067 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13068 tout(cct) << "ll_mkdir" << std::endl;
13069 tout(cct) << vparent.ino.val << std::endl;
13070 tout(cct) << name << std::endl;
13071 tout(cct) << mode << std::endl;
13072
13073 std::scoped_lock lock(client_lock);
13074
13075 if (!fuse_default_permissions) {
13076 int r = may_create(parent, perm);
13077 if (r < 0)
13078 return r;
13079 }
13080
13081 InodeRef in;
13082 int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  } else {
    attr->st_ino = 0;
  }
13087 tout(cct) << attr->st_ino << std::endl;
13088 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13089 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13090 *out = in.get();
13091 return r;
13092 }
13093
13094 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13095 struct ceph_statx *stx, unsigned want, unsigned flags,
13096 const UserPerm& perms)
13097 {
13098 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13099 if (!mref_reader.is_state_satisfied())
13100 return -CEPHFS_ENOTCONN;
13101
13102 vinodeno_t vparent = _get_vino(parent);
13103
13104 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13105 tout(cct) << "ll_mkdirx" << std::endl;
13106 tout(cct) << vparent.ino.val << std::endl;
13107 tout(cct) << name << std::endl;
13108 tout(cct) << mode << std::endl;
13109
13110 std::scoped_lock lock(client_lock);
13111
13112 if (!fuse_default_permissions) {
13113 int r = may_create(parent, perms);
13114 if (r < 0)
13115 return r;
13116 }
13117
13118 InodeRef in;
13119 int r = _mkdir(parent, name, mode, perms, &in);
13120 if (r == 0) {
13121 fill_statx(in, statx_to_mask(flags, want), stx);
13122 _ll_get(in.get());
13123 } else {
13124 stx->stx_ino = 0;
13125 stx->stx_mask = 0;
13126 }
13127 tout(cct) << stx->stx_ino << std::endl;
13128 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13129 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13130 *out = in.get();
13131 return r;
13132 }
13133
13134 int Client::_symlink(Inode *dir, const char *name, const char *target,
13135 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
13136 {
13137 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
13138 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13139 << dendl;
13140
13141 if (strlen(name) > NAME_MAX)
13142 return -CEPHFS_ENAMETOOLONG;
13143
13144 if (dir->snapid != CEPH_NOSNAP) {
13145 return -CEPHFS_EROFS;
13146 }
13147 if (is_quota_files_exceeded(dir, perms)) {
13148 return -CEPHFS_EDQUOT;
13149 }
13150
13151 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13152
13153 filepath path;
13154 dir->make_nosnap_relative_path(path);
13155 path.push_dentry(name);
13156 req->set_filepath(path);
13157 req->set_alternate_name(std::move(alternate_name));
13158 req->set_inode(dir);
13159 req->set_string2(target);
13160 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13161 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13162
13163 Dentry *de;
13164 int res = get_or_create(dir, name, &de);
13165 if (res < 0)
13166 goto fail;
13167 req->set_dentry(de);
13168
13169 res = make_request(req, perms, inp);
13170
13171 trim_cache();
13172 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
13173 res << dendl;
13174 return res;
13175
13176 fail:
13177 put_request(req);
13178 return res;
13179 }
13180
13181 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13182 struct stat *attr, Inode **out, const UserPerm& perms)
13183 {
13184 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13185 if (!mref_reader.is_state_satisfied())
13186 return -CEPHFS_ENOTCONN;
13187
13188 vinodeno_t vparent = _get_vino(parent);
13189
13190 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13191 << dendl;
13192 tout(cct) << "ll_symlink" << std::endl;
13193 tout(cct) << vparent.ino.val << std::endl;
13194 tout(cct) << name << std::endl;
13195 tout(cct) << value << std::endl;
13196
13197 std::scoped_lock lock(client_lock);
13198
13199 if (!fuse_default_permissions) {
13200 int r = may_create(parent, perms);
13201 if (r < 0)
13202 return r;
13203 }
13204
13205 InodeRef in;
13206 int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  } else {
    attr->st_ino = 0;
  }
13211 tout(cct) << attr->st_ino << std::endl;
13212 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13213 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13214 *out = in.get();
13215 return r;
13216 }
13217
13218 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13219 Inode **out, struct ceph_statx *stx, unsigned want,
13220 unsigned flags, const UserPerm& perms)
13221 {
13222 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13223 if (!mref_reader.is_state_satisfied())
13224 return -CEPHFS_ENOTCONN;
13225
13226 vinodeno_t vparent = _get_vino(parent);
13227
13228 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13229 << dendl;
13230 tout(cct) << "ll_symlinkx" << std::endl;
13231 tout(cct) << vparent.ino.val << std::endl;
13232 tout(cct) << name << std::endl;
13233 tout(cct) << value << std::endl;
13234
13235 std::scoped_lock lock(client_lock);
13236
13237 if (!fuse_default_permissions) {
13238 int r = may_create(parent, perms);
13239 if (r < 0)
13240 return r;
13241 }
13242
13243 InodeRef in;
13244 int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
13249 tout(cct) << stx->stx_ino << std::endl;
13250 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13251 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13252 *out = in.get();
13253 return r;
13254 }
13255
13256 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13257 {
13258 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
13259 << " uid " << perm.uid() << " gid " << perm.gid()
13260 << ")" << dendl;
13261
13262 if (dir->snapid != CEPH_NOSNAP) {
13263 return -CEPHFS_EROFS;
13264 }
13265
13266 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13267
13268 filepath path;
13269 dir->make_nosnap_relative_path(path);
13270 path.push_dentry(name);
13271 req->set_filepath(path);
13272
13273 InodeRef otherin;
13274 Inode *in;
13275 Dentry *de;
13276
13277 int res = get_or_create(dir, name, &de);
13278 if (res < 0)
13279 goto fail;
13280 req->set_dentry(de);
13281 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13282 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13283
13284 res = _lookup(dir, name, 0, &otherin, perm);
13285 if (res < 0)
13286 goto fail;
13287
13288 in = otherin.get();
13289 req->set_other_inode(in);
13290 in->break_all_delegs();
13291 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13292
13293 req->set_inode(dir);
13294
13295 res = make_request(req, perm);
13296
13297 trim_cache();
13298 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
13299 return res;
13300
13301 fail:
13302 put_request(req);
13303 return res;
13304 }
13305
13306 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13307 {
13308 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13309 if (!mref_reader.is_state_satisfied())
13310 return -CEPHFS_ENOTCONN;
13311
13312 vinodeno_t vino = _get_vino(in);
13313
13314 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13315 tout(cct) << "ll_unlink" << std::endl;
13316 tout(cct) << vino.ino.val << std::endl;
13317 tout(cct) << name << std::endl;
13318
13319 std::scoped_lock lock(client_lock);
13320
13321 if (!fuse_default_permissions) {
13322 int r = may_delete(in, name, perm);
13323 if (r < 0)
13324 return r;
13325 }
13326 return _unlink(in, name, perm);
13327 }
13328
13329 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13330 {
13331 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
13332 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13333
13334 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
13335 return -CEPHFS_EROFS;
13336 }
13337
13338 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13339 MetaRequest *req = new MetaRequest(op);
13340 filepath path;
13341 dir->make_nosnap_relative_path(path);
13342 path.push_dentry(name);
13343 req->set_filepath(path);
13344 req->set_inode(dir);
13345
13346 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13347 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13348 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13349
13350 InodeRef in;
13351
13352 Dentry *de;
13353 int res = get_or_create(dir, name, &de);
13354 if (res < 0)
13355 goto fail;
13356 if (op == CEPH_MDS_OP_RMDIR)
13357 req->set_dentry(de);
13358 else
13359 de->get();
13360
13361 res = _lookup(dir, name, 0, &in, perms);
13362 if (res < 0)
13363 goto fail;
13364
13365 if (op == CEPH_MDS_OP_RMSNAP) {
13366 unlink(de, true, true);
13367 de->put();
13368 }
13369 req->set_other_inode(in.get());
13370
13371 res = make_request(req, perms);
13372
13373 trim_cache();
13374 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
13375 return res;
13376
13377 fail:
13378 put_request(req);
13379 return res;
13380 }
13381
13382 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13383 {
13384 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13385 if (!mref_reader.is_state_satisfied())
13386 return -CEPHFS_ENOTCONN;
13387
13388 vinodeno_t vino = _get_vino(in);
13389
13390 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13391 tout(cct) << "ll_rmdir" << std::endl;
13392 tout(cct) << vino.ino.val << std::endl;
13393 tout(cct) << name << std::endl;
13394
13395 std::scoped_lock lock(client_lock);
13396
13397 if (!fuse_default_permissions) {
13398 int r = may_delete(in, name, perms);
13399 if (r < 0)
13400 return r;
13401 }
13402
13403 return _rmdir(in, name, perms);
13404 }
13405
13406 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
13407 {
13408 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
13409 << todir->ino << " " << toname
13410 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13411 << dendl;
13412
13413 if (fromdir->snapid != todir->snapid)
13414 return -CEPHFS_EXDEV;
13415
13416 int op = CEPH_MDS_OP_RENAME;
13417 if (fromdir->snapid != CEPH_NOSNAP) {
13418 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13419 op = CEPH_MDS_OP_RENAMESNAP;
13420 else
13421 return -CEPHFS_EROFS;
13422 }
13423 if (fromdir != todir) {
13424 Inode *fromdir_root =
13425 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13426 Inode *todir_root =
13427 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13428 if (fromdir_root != todir_root) {
13429 return -CEPHFS_EXDEV;
13430 }
13431 }
13432
13433 InodeRef target;
13434 MetaRequest *req = new MetaRequest(op);
13435
13436 filepath from;
13437 fromdir->make_nosnap_relative_path(from);
13438 from.push_dentry(fromname);
13439 filepath to;
13440 todir->make_nosnap_relative_path(to);
13441 to.push_dentry(toname);
13442 req->set_filepath(to);
13443 req->set_filepath2(from);
13444 req->set_alternate_name(std::move(alternate_name));
13445
13446 Dentry *oldde;
13447 int res = get_or_create(fromdir, fromname, &oldde);
13448 if (res < 0)
13449 goto fail;
13450 Dentry *de;
13451 res = get_or_create(todir, toname, &de);
13452 if (res < 0)
13453 goto fail;
13454
13455 if (op == CEPH_MDS_OP_RENAME) {
13456 req->set_old_dentry(oldde);
13457 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13458 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13459
13460 req->set_dentry(de);
13461 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13462 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13463
13464 InodeRef oldin, otherin;
13465 res = _lookup(fromdir, fromname, 0, &oldin, perm);
13466 if (res < 0)
13467 goto fail;
13468
13469 Inode *oldinode = oldin.get();
13470 oldinode->break_all_delegs();
13471 req->set_old_inode(oldinode);
13472 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13473
13474 res = _lookup(todir, toname, 0, &otherin, perm);
13475 switch (res) {
13476 case 0:
13477 {
13478 Inode *in = otherin.get();
13479 req->set_other_inode(in);
13480 in->break_all_delegs();
13481 }
13482 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13483 break;
13484 case -CEPHFS_ENOENT:
13485 break;
13486 default:
13487 goto fail;
13488 }
13489
13490 req->set_inode(todir);
13491 } else {
13492 // renamesnap reply contains no tracedn, so we need to invalidate
13493 // dentry manually
13494 unlink(oldde, true, true);
13495 unlink(de, true, true);
13496
13497 req->set_inode(todir);
13498 }
13499
13500 res = make_request(req, perm, &target);
13501 ldout(cct, 10) << "rename result is " << res << dendl;
13502
  // trim_cache() below will drop the renamed item from our cache
13504
13505 trim_cache();
13506 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
13507 return res;
13508
13509 fail:
13510 put_request(req);
13511 return res;
13512 }
13513
13514 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13515 const char *newname, const UserPerm& perm)
13516 {
13517 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13518 if (!mref_reader.is_state_satisfied())
13519 return -CEPHFS_ENOTCONN;
13520
13521 vinodeno_t vparent = _get_vino(parent);
13522 vinodeno_t vnewparent = _get_vino(newparent);
13523
13524 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13525 << vnewparent << " " << newname << dendl;
13526 tout(cct) << "ll_rename" << std::endl;
13527 tout(cct) << vparent.ino.val << std::endl;
13528 tout(cct) << name << std::endl;
13529 tout(cct) << vnewparent.ino.val << std::endl;
13530 tout(cct) << newname << std::endl;
13531
13532 std::scoped_lock lock(client_lock);
13533
13534 if (!fuse_default_permissions) {
13535 int r = may_delete(parent, name, perm);
13536 if (r < 0)
13537 return r;
13538 r = may_delete(newparent, newname, perm);
13539 if (r < 0 && r != -CEPHFS_ENOENT)
13540 return r;
13541 }
13542
13543 return _rename(parent, name, newparent, newname, perm, "");
13544 }
13545
13546 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
13547 {
13548 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
13549 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13550
13551 if (strlen(newname) > NAME_MAX)
13552 return -CEPHFS_ENAMETOOLONG;
13553
13554 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
13555 return -CEPHFS_EROFS;
13556 }
13557 if (is_quota_files_exceeded(dir, perm)) {
13558 return -CEPHFS_EDQUOT;
13559 }
13560
13561 in->break_all_delegs();
13562 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13563
13564 filepath path(newname, dir->ino);
13565 req->set_filepath(path);
13566 req->set_alternate_name(std::move(alternate_name));
13567 filepath existing(in->ino);
13568 req->set_filepath2(existing);
13569
13570 req->set_inode(dir);
13571 req->inode_drop = CEPH_CAP_FILE_SHARED;
13572 req->inode_unless = CEPH_CAP_FILE_EXCL;
13573
13574 Dentry *de;
13575 int res = get_or_create(dir, newname, &de);
13576 if (res < 0)
13577 goto fail;
13578 req->set_dentry(de);
13579
13580 res = make_request(req, perm, inp);
13581 ldout(cct, 10) << "link result is " << res << dendl;
13582
13583 trim_cache();
13584 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
13585 return res;
13586
13587 fail:
13588 put_request(req);
13589 return res;
13590 }
13591
13592 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13593 const UserPerm& perm)
13594 {
13595 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13596 if (!mref_reader.is_state_satisfied())
13597 return -CEPHFS_ENOTCONN;
13598
13599 vinodeno_t vino = _get_vino(in);
13600 vinodeno_t vnewparent = _get_vino(newparent);
13601
13602 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
13603 newname << dendl;
13604 tout(cct) << "ll_link" << std::endl;
13605 tout(cct) << vino.ino.val << std::endl;
13606 tout(cct) << vnewparent << std::endl;
13607 tout(cct) << newname << std::endl;
13608
13609 InodeRef target;
13610
13611 std::scoped_lock lock(client_lock);
13612
13613 if (!fuse_default_permissions) {
13614 if (S_ISDIR(in->mode))
13615 return -CEPHFS_EPERM;
13616
13617 int r = may_hardlink(in, perm);
13618 if (r < 0)
13619 return r;
13620
13621 r = may_create(newparent, perm);
13622 if (r < 0)
13623 return r;
13624 }
13625
13626 return _link(in, newparent, newname, perm, "", &target);
13627 }
13628
13629 int Client::ll_num_osds(void)
13630 {
13631 std::scoped_lock lock(client_lock);
13632 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
13633 }
13634
13635 int Client::ll_osdaddr(int osd, uint32_t *addr)
13636 {
13637 std::scoped_lock lock(client_lock);
13638
13639 entity_addr_t g;
13640 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
13641 if (!o.exists(osd))
13642 return false;
13643 g = o.get_addrs(osd).front();
13644 return true;
13645 });
13646 if (!exists)
13647 return -1;
13648 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
13649 *addr = ntohl(nb_addr);
13650 return 0;
13651 }
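
/*
 * Note on ll_osdaddr(): the OSD's IPv4 address is handed back in host
 * byte order. A caller would therefore convert it back before using it
 * with the socket API. Minimal usage sketch (hypothetical caller code,
 * not part of this file's API surface):
 *
 *   uint32_t a;
 *   if (client->ll_osdaddr(0, &a) == 0) {
 *     struct in_addr in4;
 *     in4.s_addr = htonl(a);   // back to network byte order
 *   }
 */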
13652
13653 uint32_t Client::ll_stripe_unit(Inode *in)
13654 {
13655 std::scoped_lock lock(client_lock);
13656 return in->layout.stripe_unit;
13657 }
13658
13659 uint64_t Client::ll_snap_seq(Inode *in)
13660 {
13661 std::scoped_lock lock(client_lock);
13662 return in->snaprealm->seq;
13663 }
13664
13665 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
13666 {
13667 std::scoped_lock lock(client_lock);
13668 *layout = in->layout;
13669 return 0;
13670 }
13671
13672 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
13673 {
13674 return ll_file_layout(fh->inode.get(), layout);
13675 }
13676
13677 /* Currently we cannot take advantage of redundancy in reads, since we
13678 would have to go through all possible placement groups (a
13679 potentially quite large number determined by a hash), and use CRUSH
13680 to calculate the appropriate set of OSDs for each placement group,
13681 then index into that. An array with one entry per OSD is much more
13682 tractable and works for demonstration purposes. */
13683
13684 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
13685 file_layout_t* layout)
13686 {
13687 std::scoped_lock lock(client_lock);
13688
13689 inodeno_t ino = in->ino;
13690 uint32_t object_size = layout->object_size;
13691 uint32_t su = layout->stripe_unit;
13692 uint32_t stripe_count = layout->stripe_count;
13693 uint64_t stripes_per_object = object_size / su;
13694 uint64_t stripeno = 0, stripepos = 0;
13695
  if (stripe_count) {
13697 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
13698 stripepos = blockno % stripe_count; // which object in the object set (X)
13699 }
13700 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
13701 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
13702
13703 object_t oid = file_object_t(ino, objectno);
13704 return objecter->with_osdmap([&](const OSDMap& o) {
13705 ceph_object_layout olayout =
13706 o.file_to_object_layout(oid, *layout);
13707 pg_t pg = (pg_t)olayout.ol_pgid;
13708 vector<int> osds;
13709 int primary;
13710 o.pg_to_acting_osds(pg, &osds, &primary);
13711 return primary;
13712 });
13713 }
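
/*
 * Worked example of the striping arithmetic above (all layout values are
 * assumed for illustration, not taken from a real pool):
 *
 *   object_size = 4 MiB, stripe_unit (su) = 1 MiB, stripe_count = 3
 *     => stripes_per_object = object_size / su = 4
 *
 *   For blockno = 10:
 *     stripeno    = 10 / 3 = 3      // fourth horizontal stripe
 *     stripepos   = 10 % 3 = 1      // second object in the object set
 *     objectsetno = 3 / 4  = 0      // still in the first object set
 *     objectno    = 0 * 3 + 1 = 1   // second object of the file
 *
 *   The OSD returned is then the primary for object 1's placement group.
 */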
13714
13715 /* Return the offset of the block, internal to the object */
13716
13717 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
13718 {
13719 std::scoped_lock lock(client_lock);
13720 file_layout_t *layout=&(in->layout);
13721 uint32_t object_size = layout->object_size;
13722 uint32_t su = layout->stripe_unit;
13723 uint64_t stripes_per_object = object_size / su;
13724
13725 return (blockno % stripes_per_object) * su;
13726 }
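
/*
 * Example for ll_get_internal_offset() (values assumed for illustration):
 * with object_size = 4 MiB and su = 1 MiB, stripes_per_object = 4, so
 * blockno = 10 maps to (10 % 4) * 1 MiB = 2 MiB into its object.
 */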
13727
13728 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
13729 const UserPerm& perms)
13730 {
13731 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13732 if (!mref_reader.is_state_satisfied())
13733 return -CEPHFS_ENOTCONN;
13734
13735 vinodeno_t vino = _get_vino(in);
13736
13737 ldout(cct, 3) << "ll_opendir " << vino << dendl;
13738 tout(cct) << "ll_opendir" << std::endl;
13739 tout(cct) << vino.ino.val << std::endl;
13740
13741 std::scoped_lock lock(client_lock);
13742
13743 if (!fuse_default_permissions) {
13744 int r = may_open(in, flags, perms);
13745 if (r < 0)
13746 return r;
13747 }
13748
13749 int r = _opendir(in, dirpp, perms);
13750 tout(cct) << (uintptr_t)*dirpp << std::endl;
13751
13752 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
13753 << dendl;
13754 return r;
13755 }
13756
13757 int Client::ll_releasedir(dir_result_t *dirp)
13758 {
13759 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13760 if (!mref_reader.is_state_satisfied())
13761 return -CEPHFS_ENOTCONN;
13762
13763 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
13764 tout(cct) << "ll_releasedir" << std::endl;
13765 tout(cct) << (uintptr_t)dirp << std::endl;
13766
13767 std::scoped_lock lock(client_lock);
13768
13769 _closedir(dirp);
13770 return 0;
13771 }
13772
13773 int Client::ll_fsyncdir(dir_result_t *dirp)
13774 {
13775 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13776 if (!mref_reader.is_state_satisfied())
13777 return -CEPHFS_ENOTCONN;
13778
13779 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
13780 tout(cct) << "ll_fsyncdir" << std::endl;
13781 tout(cct) << (uintptr_t)dirp << std::endl;
13782
13783 std::scoped_lock lock(client_lock);
13784 return _fsync(dirp->inode.get(), false);
13785 }
13786
13787 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
13788 {
13789 ceph_assert(!(flags & O_CREAT));
13790
13791 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13792 if (!mref_reader.is_state_satisfied())
13793 return -CEPHFS_ENOTCONN;
13794
13795 vinodeno_t vino = _get_vino(in);
13796
13797 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
13798 tout(cct) << "ll_open" << std::endl;
13799 tout(cct) << vino.ino.val << std::endl;
13800 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13801
13802 std::scoped_lock lock(client_lock);
13803
13804 int r;
13805 if (!fuse_default_permissions) {
13806 r = may_open(in, flags, perms);
13807 if (r < 0)
13808 goto out;
13809 }
13810
13811 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
13812
13813 out:
13814 Fh *fhptr = fhp ? *fhp : NULL;
13815 if (fhptr) {
13816 ll_unclosed_fh_set.insert(fhptr);
13817 }
13818 tout(cct) << (uintptr_t)fhptr << std::endl;
13819 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
13820 " = " << r << " (" << fhptr << ")" << dendl;
13821 return r;
13822 }
13823
13824 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
13825 int flags, InodeRef *in, int caps, Fh **fhp,
13826 const UserPerm& perms)
13827 {
13828 *fhp = NULL;
13829
13830 vinodeno_t vparent = _get_vino(parent);
13831
13832 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13833 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
13834 << ", gid " << perms.gid() << dendl;
13835 tout(cct) << "ll_create" << std::endl;
13836 tout(cct) << vparent.ino.val << std::endl;
13837 tout(cct) << name << std::endl;
13838 tout(cct) << mode << std::endl;
13839 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
13840
13841 bool created = false;
13842 int r = _lookup(parent, name, caps, in, perms);
13843
13844 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
13845 return -CEPHFS_EEXIST;
13846
13847 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
13848 if (!fuse_default_permissions) {
13849 r = may_create(parent, perms);
13850 if (r < 0)
13851 goto out;
13852 }
13853 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
13854 perms, "");
13855 if (r < 0)
13856 goto out;
13857 }
13858
13859 if (r < 0)
13860 goto out;
13861
13862 ceph_assert(*in);
13863
13864 ldout(cct, 20) << "_ll_create created = " << created << dendl;
13865 if (!created) {
13866 if (!fuse_default_permissions) {
13867 r = may_open(in->get(), flags, perms);
13868 if (r < 0) {
13869 if (*fhp) {
13870 int release_r = _release_fh(*fhp);
13871 ceph_assert(release_r == 0); // during create, no async data ops should have happened
13872 }
13873 goto out;
13874 }
13875 }
13876 if (*fhp == NULL) {
13877 r = _open(in->get(), flags, mode, fhp, perms);
13878 if (r < 0)
13879 goto out;
13880 }
13881 }
13882
13883 out:
13884 if (*fhp) {
13885 ll_unclosed_fh_set.insert(*fhp);
13886 }
13887
13888 ino_t ino = 0;
13889 if (r >= 0) {
13890 Inode *inode = in->get();
13891 if (use_faked_inos())
13892 ino = inode->faked_ino;
13893 else
13894 ino = inode->ino;
13895 }
13896
13897 tout(cct) << (uintptr_t)*fhp << std::endl;
13898 tout(cct) << ino << std::endl;
13899 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
13900 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
13901 *fhp << " " << hex << ino << dec << ")" << dendl;
13902
13903 return r;
13904 }
13905
13906 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13907 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13908 const UserPerm& perms)
13909 {
13910 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13911 if (!mref_reader.is_state_satisfied())
13912 return -CEPHFS_ENOTCONN;
13913
13914 std::scoped_lock lock(client_lock);
13915 InodeRef in;
13916
13917 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13918 fhp, perms);
13919 if (r >= 0) {
13920 ceph_assert(in);
13921
13922 // passing an Inode in outp requires an additional ref
13923 if (outp) {
13924 _ll_get(in.get());
13925 *outp = in.get();
13926 }
13927 fill_stat(in, attr);
13928 } else {
13929 attr->st_ino = 0;
13930 }
13931
13932 return r;
13933 }
13934
13935 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13936 int oflags, Inode **outp, Fh **fhp,
13937 struct ceph_statx *stx, unsigned want, unsigned lflags,
13938 const UserPerm& perms)
13939 {
13940 unsigned caps = statx_to_mask(lflags, want);
13941 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13942 if (!mref_reader.is_state_satisfied())
13943 return -CEPHFS_ENOTCONN;
13944
13945 std::scoped_lock lock(client_lock);
13946 InodeRef in;
13947
13948 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13949 if (r >= 0) {
13950 ceph_assert(in);
13951
13952 // passing an Inode in outp requires an additional ref
13953 if (outp) {
13954 _ll_get(in.get());
13955 *outp = in.get();
13956 }
13957 fill_statx(in, caps, stx);
13958 } else {
13959 stx->stx_ino = 0;
13960 stx->stx_mask = 0;
13961 }
13962
13963 return r;
13964 }
13965
13966 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13967 {
13968 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13969 if (!mref_reader.is_state_satisfied())
13970 return -CEPHFS_ENOTCONN;
13971
13972 tout(cct) << "ll_lseek" << std::endl;
13973 tout(cct) << offset << std::endl;
13974 tout(cct) << whence << std::endl;
13975
13976 std::scoped_lock lock(client_lock);
13977 return _lseek(fh, offset, whence);
13978 }
13979
13980 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13981 {
13982 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13983 if (!mref_reader.is_state_satisfied())
13984 return -CEPHFS_ENOTCONN;
13985
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
13987 tout(cct) << "ll_read" << std::endl;
13988 tout(cct) << (uintptr_t)fh << std::endl;
13989 tout(cct) << off << std::endl;
13990 tout(cct) << len << std::endl;
13991
  /* We can't return bytes read larger than INT_MAX, clamp len to that */
13993 len = std::min(len, (loff_t)INT_MAX);
13994 std::scoped_lock lock(client_lock);
13995
13996 int r = _read(fh, off, len, bl);
13997 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
13998 << dendl;
13999 return r;
14000 }
14001
14002 int Client::ll_read_block(Inode *in, uint64_t blockid,
14003 char *buf,
14004 uint64_t offset,
14005 uint64_t length,
14006 file_layout_t* layout)
14007 {
14008 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14009 if (!mref_reader.is_state_satisfied())
14010 return -CEPHFS_ENOTCONN;
14011
14012 vinodeno_t vino = _get_vino(in);
14013 object_t oid = file_object_t(vino.ino, blockid);
14014 C_SaferCond onfinish;
14015 bufferlist bl;
14016
14017 objecter->read(oid,
14018 object_locator_t(layout->pool_id),
14019 offset,
14020 length,
14021 vino.snapid,
14022 &bl,
14023 CEPH_OSD_FLAG_READ,
14024 &onfinish);
14025
14026 int r = onfinish.wait();
14027 if (r >= 0) {
14028 bl.begin().copy(bl.length(), buf);
14029 r = bl.length();
14030 }
14031
14032 return r;
14033 }
14034
/* It appears that the OSD doesn't return success unless the entire
   buffer was written; return the write length on success. */
14037
14038 int Client::ll_write_block(Inode *in, uint64_t blockid,
14039 char* buf, uint64_t offset,
14040 uint64_t length, file_layout_t* layout,
14041 uint64_t snapseq, uint32_t sync)
14042 {
14043 vinodeno_t vino = ll_get_vino(in);
14044 int r = 0;
14045 std::unique_ptr<C_SaferCond> onsafe = nullptr;
14046
14047 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14048 if (!mref_reader.is_state_satisfied())
14049 return -CEPHFS_ENOTCONN;
14050
14051 if (length == 0) {
14052 return -CEPHFS_EINVAL;
14053 }
  if (true || sync) {
    /* writes are currently always treated as stable (the "true ||"
     * forces it), so the epilogue below waits on onsafe before
     * returning */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
14059 object_t oid = file_object_t(vino.ino, blockid);
14060 SnapContext fakesnap;
14061 ceph::bufferlist bl;
14062 if (length > 0) {
14063 bl.push_back(buffer::copy(buf, length));
14064 }
14065
  ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
                << dendl;
14068
14069 fakesnap.seq = snapseq;
14070
14071 /* lock just in time */
14072 objecter->write(oid,
14073 object_locator_t(layout->pool_id),
14074 offset,
14075 length,
14076 fakesnap,
14077 bl,
14078 ceph::real_clock::now(),
14079 0,
14080 onsafe.get());
14081
14082 if (nullptr != onsafe) {
14083 r = onsafe->wait();
14084 }
14085
14086 if (r < 0) {
14087 return r;
14088 } else {
14089 return length;
14090 }
14091 }
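
/*
 * Minimal usage sketch for ll_write_block() (hypothetical caller; the
 * buffer, block id and snap sequence are assumptions for illustration).
 * Because the OSD only reports success for a complete write, a return
 * equal to the requested length is the only success case:
 *
 *   char buf[4096] = {0};
 *   int r = client->ll_write_block(in, 0, buf, 0, sizeof(buf),
 *                                  &in->layout, snapseq, 1);
 *   if (r != (int)sizeof(buf)) {
 *     // r < 0 carries the error; a short write is never returned
 *   }
 */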
14092
14093 int Client::ll_commit_blocks(Inode *in,
14094 uint64_t offset,
14095 uint64_t length)
14096 {
14097 /*
14098 BarrierContext *bctx;
14099 vinodeno_t vino = _get_vino(in);
14100 uint64_t ino = vino.ino;
14101
14102 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14103 << offset << " to " << length << dendl;
14104
14105 if (length == 0) {
14106 return -CEPHFS_EINVAL;
14107 }
14108
14109 std::scoped_lock lock(client_lock);
14110 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14111 if (p != barriers.end()) {
14112 barrier_interval civ(offset, offset + length);
14113 p->second->commit_barrier(civ);
14114 }
14115 */
14116 return 0;
14117 }
14118
14119 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14120 {
14121 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14122 "~" << len << dendl;
14123 tout(cct) << "ll_write" << std::endl;
14124 tout(cct) << (uintptr_t)fh << std::endl;
14125 tout(cct) << off << std::endl;
14126 tout(cct) << len << std::endl;
14127
14128 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14129 if (!mref_reader.is_state_satisfied())
14130 return -CEPHFS_ENOTCONN;
14131
14132 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14133 len = std::min(len, (loff_t)INT_MAX);
14134 std::scoped_lock lock(client_lock);
14135
14136 int r = _write(fh, off, len, data, NULL, 0);
14137 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14138 << dendl;
14139 return r;
14140 }
14141
14142 int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14143 {
14144 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14145 if (!mref_reader.is_state_satisfied())
14146 return -CEPHFS_ENOTCONN;
14147
14148 std::unique_lock cl(client_lock);
14149 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false, cl);
14150 }
14151
14152 int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14153 {
14154 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14155 if (!mref_reader.is_state_satisfied())
14156 return -CEPHFS_ENOTCONN;
14157
14158 std::unique_lock cl(client_lock);
14159 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false, cl);
14160 }
14161
14162 int Client::ll_flush(Fh *fh)
14163 {
14164 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14165 if (!mref_reader.is_state_satisfied())
14166 return -CEPHFS_ENOTCONN;
14167
14168 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14169 tout(cct) << "ll_flush" << std::endl;
14170 tout(cct) << (uintptr_t)fh << std::endl;
14171
14172 std::scoped_lock lock(client_lock);
14173 return _flush(fh);
14174 }
14175
14176 int Client::ll_fsync(Fh *fh, bool syncdataonly)
14177 {
14178 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14179 if (!mref_reader.is_state_satisfied())
14180 return -CEPHFS_ENOTCONN;
14181
14182 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14183 tout(cct) << "ll_fsync" << std::endl;
14184 tout(cct) << (uintptr_t)fh << std::endl;
14185
14186 std::scoped_lock lock(client_lock);
14187 int r = _fsync(fh, syncdataonly);
14188 if (r) {
14189 // If we're returning an error, clear it from the FH
14190 fh->take_async_err();
14191 }
14192 return r;
14193 }
14194
14195 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14196 {
14197 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14198 if (!mref_reader.is_state_satisfied())
14199 return -CEPHFS_ENOTCONN;
14200
14201 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14202 tout(cct) << "ll_sync_inode" << std::endl;
14203 tout(cct) << (uintptr_t)in << std::endl;
14204
14205 std::scoped_lock lock(client_lock);
14206 return _fsync(in, syncdataonly);
14207 }
14208
14209 #ifdef FALLOC_FL_PUNCH_HOLE
14210
14211 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14212 {
14213 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14214
14215 if (offset < 0 || length <= 0)
14216 return -CEPHFS_EINVAL;
14217
14218 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
14219 return -CEPHFS_EOPNOTSUPP;
14220
14221 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
14222 return -CEPHFS_EOPNOTSUPP;
14223
14224 Inode *in = fh->inode.get();
14225
14226 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14227 !(mode & FALLOC_FL_PUNCH_HOLE)) {
14228 return -CEPHFS_ENOSPC;
14229 }
14230
14231 if (in->snapid != CEPH_NOSNAP)
14232 return -CEPHFS_EROFS;
14233
14234 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
14235 return -CEPHFS_EBADF;
14236
14237 uint64_t size = offset + length;
14238 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14239 size > in->size &&
14240 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
14241 return -CEPHFS_EDQUOT;
14242 }
14243
14244 int have;
14245 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
14246 if (r < 0)
14247 return r;
14248
14249 std::unique_ptr<C_SaferCond> onuninline = nullptr;
14250 if (mode & FALLOC_FL_PUNCH_HOLE) {
14251 if (in->inline_version < CEPH_INLINE_NONE &&
14252 (have & CEPH_CAP_FILE_BUFFER)) {
14253 bufferlist bl;
14254 auto inline_iter = in->inline_data.cbegin();
14255 int len = in->inline_data.length();
14256 if (offset < len) {
14257 if (offset > 0)
14258 inline_iter.copy(offset, bl);
14259 int size = length;
14260 if (offset + size > len)
14261 size = len - offset;
14262 if (size > 0)
14263 bl.append_zero(size);
14264 if (offset + size < len) {
14265 inline_iter += size;
14266 inline_iter.copy(len - offset - size, bl);
14267 }
14268 in->inline_data = bl;
14269 in->inline_version++;
14270 }
14271 in->mtime = in->ctime = ceph_clock_now();
14272 in->change_attr++;
14273 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14274 } else {
14275 if (in->inline_version < CEPH_INLINE_NONE) {
14276 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14277 uninline_data(in, onuninline.get());
14278 }
14279
14280 C_SaferCond onfinish("Client::_punch_hole flock");
14281
14282 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14283
14284 _invalidate_inode_cache(in, offset, length);
14285 filer->zero(in->ino, &in->layout,
14286 in->snaprealm->get_snap_context(),
14287 offset, length,
14288 ceph::real_clock::now(),
14289 0, true, &onfinish);
14290 in->mtime = in->ctime = ceph_clock_now();
14291 in->change_attr++;
14292 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14293
14294 client_lock.unlock();
14295 onfinish.wait();
14296 client_lock.lock();
14297 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14298 }
14299 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14300 uint64_t size = offset + length;
14301 if (size > in->size) {
14302 in->size = size;
14303 in->mtime = in->ctime = ceph_clock_now();
14304 in->change_attr++;
14305 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14306
14307 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
14308 check_caps(in, CHECK_CAPS_NODELAY);
14309 } else if (is_max_size_approaching(in)) {
14310 check_caps(in, 0);
14311 }
14312 }
14313 }
14314
14315 if (nullptr != onuninline) {
14316 client_lock.unlock();
14317 int ret = onuninline->wait();
14318 client_lock.lock();
14319
14320 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
14321 in->inline_data.clear();
14322 in->inline_version = CEPH_INLINE_NONE;
14323 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
14324 check_caps(in, 0);
14325 } else
14326 r = ret;
14327 }
14328
14329 put_cap_ref(in, CEPH_CAP_FILE_WR);
14330 return r;
14331 }
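
/*
 * Sketch of the inline-data punch-hole splice above (buffer contents
 * assumed for illustration): with inline_data = "ABCDEFGH" (len 8),
 * offset = 2 and length = 3, the replacement buffer is assembled as
 *
 *   "AB"        bytes [0, offset) copied from the old data
 *   "\0\0\0"    the zeroed hole, size = min(length, len - offset)
 *   "FGH"       the tail, bytes [offset + size, len)
 *
 * so the inline length stays 8 and only the hole is cleared.
 */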
14332 #else
14333
14334 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14335 {
14336 return -CEPHFS_EOPNOTSUPP;
14337 }
14338
14339 #endif
14340
14341
14342 int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14343 {
14344 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14345 if (!mref_reader.is_state_satisfied())
14346 return -CEPHFS_ENOTCONN;
14347
14348 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14349 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
14350 tout(cct) << (uintptr_t)fh << std::endl;
14351
14352 std::scoped_lock lock(client_lock);
14353 return _fallocate(fh, mode, offset, length);
14354 }
14355
14356 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14357 {
14358 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14359 if (!mref_reader.is_state_satisfied())
14360 return -CEPHFS_ENOTCONN;
14361
  tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
14363
14364 std::scoped_lock lock(client_lock);
14365 Fh *fh = get_filehandle(fd);
14366 if (!fh)
14367 return -CEPHFS_EBADF;
14368 #if defined(__linux__) && defined(O_PATH)
14369 if (fh->flags & O_PATH)
14370 return -CEPHFS_EBADF;
14371 #endif
14372 return _fallocate(fh, mode, offset, length);
14373 }
14374
14375 int Client::ll_release(Fh *fh)
14376 {
14377 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14378 if (!mref_reader.is_state_satisfied())
14379 return -CEPHFS_ENOTCONN;
14380
14381 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
14382 dendl;
14383 tout(cct) << __func__ << " (fh)" << std::endl;
14384 tout(cct) << (uintptr_t)fh << std::endl;
14385
14386 std::scoped_lock lock(client_lock);
14387
14388 if (ll_unclosed_fh_set.count(fh))
14389 ll_unclosed_fh_set.erase(fh);
14390 return _release_fh(fh);
14391 }
14392
14393 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14394 {
14395 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14396 if (!mref_reader.is_state_satisfied())
14397 return -CEPHFS_ENOTCONN;
14398
14399 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (uintptr_t)fh << std::endl;
14401
14402 std::scoped_lock lock(client_lock);
14403 return _getlk(fh, fl, owner);
14404 }
14405
14406 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14407 {
14408 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14409 if (!mref_reader.is_state_satisfied())
14410 return -CEPHFS_ENOTCONN;
14411
14412 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14413 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14414
14415 std::scoped_lock lock(client_lock);
14416 return _setlk(fh, fl, owner, sleep);
14417 }
14418
14419 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14420 {
14421 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14422 if (!mref_reader.is_state_satisfied())
14423 return -CEPHFS_ENOTCONN;
14424
14425 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
14426 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
14427
14428 std::scoped_lock lock(client_lock);
14429 return _flock(fh, cmd, owner);
14430 }
14431
14432 int Client::set_deleg_timeout(uint32_t timeout)
14433 {
14434 std::scoped_lock lock(client_lock);
14435
14436 /*
14437 * The whole point is to prevent blocklisting so we must time out the
14438 * delegation before the session autoclose timeout kicks in.
14439 */
14440 if (timeout >= mdsmap->get_session_autoclose())
14441 return -CEPHFS_EINVAL;
14442
14443 deleg_timeout = timeout;
14444 return 0;
14445 }
14446
14447 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14448 {
14449 int ret = -CEPHFS_EINVAL;
14450
14451 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14452 if (!mref_reader.is_state_satisfied())
14453 return -CEPHFS_ENOTCONN;
14454
14455 std::scoped_lock lock(client_lock);
14456
14457 Inode *inode = fh->inode.get();
14458
14459 switch(cmd) {
14460 case CEPH_DELEGATION_NONE:
14461 inode->unset_deleg(fh);
14462 ret = 0;
14463 break;
14464 default:
14465 try {
14466 ret = inode->set_deleg(fh, cmd, cb, priv);
14467 } catch (std::bad_alloc&) {
14468 ret = -CEPHFS_ENOMEM;
14469 }
14470 break;
14471 }
14472 return ret;
14473 }
14474
14475 class C_Client_RequestInterrupt : public Context {
14476 private:
14477 Client *client;
14478 MetaRequest *req;
14479 public:
14480 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14481 req->get();
14482 }
14483 void finish(int r) override {
14484 std::scoped_lock l(client->client_lock);
14485 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
14486 client->_interrupt_filelock(req);
14487 client->put_request(req);
14488 }
14489 };
14490
14491 void Client::ll_interrupt(void *d)
14492 {
14493 MetaRequest *req = static_cast<MetaRequest*>(d);
14494 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14495 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
14496 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14497 }
14498
14499 // =========================================
14500 // layout
14501
14502 // expose file layouts
14503
14504 int Client::describe_layout(const char *relpath, file_layout_t *lp,
14505 const UserPerm& perms)
14506 {
14507 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14508 if (!mref_reader.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN;
14510
14511 std::scoped_lock lock(client_lock);
14512
14513 filepath path(relpath);
14514 InodeRef in;
14515 int r = path_walk(path, &in, perms);
14516 if (r < 0)
14517 return r;
14518
14519 *lp = in->layout;
14520
14521 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
14522 return 0;
14523 }
14524
14525 int Client::fdescribe_layout(int fd, file_layout_t *lp)
14526 {
14527 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14528 if (!mref_reader.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN;
14530
14531 std::scoped_lock lock(client_lock);
14532
14533 Fh *f = get_filehandle(fd);
14534 if (!f)
14535 return -CEPHFS_EBADF;
14536 Inode *in = f->inode.get();
14537
14538 *lp = in->layout;
14539
14540 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
14541 return 0;
14542 }
14543
14544 int64_t Client::get_default_pool_id()
14545 {
14546 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14547 if (!mref_reader.is_state_satisfied())
14548 return -CEPHFS_ENOTCONN;
14549
14550 std::scoped_lock lock(client_lock);
14551
14552 /* first data pool is the default */
14553 return mdsmap->get_first_data_pool();
14554 }
14555
14556 // expose osdmap
14557
14558 int64_t Client::get_pool_id(const char *pool_name)
14559 {
14560 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14561 if (!mref_reader.is_state_satisfied())
14562 return -CEPHFS_ENOTCONN;
14563
14564 std::scoped_lock lock(client_lock);
14565
14566 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14567 pool_name);
14568 }
14569
14570 string Client::get_pool_name(int64_t pool)
14571 {
14572 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14573 if (!mref_reader.is_state_satisfied())
14574 return string();
14575
14576 std::scoped_lock lock(client_lock);
14577
14578 return objecter->with_osdmap([pool](const OSDMap& o) {
14579 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14580 });
14581 }
14582
14583 int Client::get_pool_replication(int64_t pool)
14584 {
14585 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14586 if (!mref_reader.is_state_satisfied())
14587 return -CEPHFS_ENOTCONN;
14588
14589 std::scoped_lock lock(client_lock);
14590
14591 return objecter->with_osdmap([pool](const OSDMap& o) {
14592 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
14593 });
14594 }
14595
14596 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14597 {
14598 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14599 if (!mref_reader.is_state_satisfied())
14600 return -CEPHFS_ENOTCONN;
14601
14602 std::scoped_lock lock(client_lock);
14603
14604 Fh *f = get_filehandle(fd);
14605 if (!f)
14606 return -CEPHFS_EBADF;
14607 Inode *in = f->inode.get();
14608
14609 vector<ObjectExtent> extents;
14610 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
14611 ceph_assert(extents.size() == 1);
14612
14613 objecter->with_osdmap([&](const OSDMap& o) {
14614 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14615 o.pg_to_acting_osds(pg, osds);
14616 });
14617
14618 if (osds.empty())
14619 return -CEPHFS_EINVAL;
14620
14621 /*
14622 * Return the remainder of the extent (stripe unit)
14623 *
14624 * If length = 1 is passed to Striper::file_to_extents we get a single
14625 * extent back, but its length is one so we still need to compute the length
14626 * to the end of the stripe unit.
14627 *
14628 * If length = su then we may get 1 or 2 objects back in the extents vector
14629 * which would have to be examined. Even then, the offsets are local to the
14630 * object, so matching up to the file offset is extra work.
14631 *
14632 * It seems simpler to stick with length = 1 and manually compute the
14633 * remainder.
14634 */
14635 if (len) {
14636 uint64_t su = in->layout.stripe_unit;
14637 *len = su - (off % su);
14638 }
14639
14640 return 0;
14641 }
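
/*
 * Example of the remainder computation above (stripe unit assumed to be
 * 1 MiB for illustration): for off = 2.5 MiB,
 *   *len = su - (off % su) = 1 MiB - 0.5 MiB = 0.5 MiB,
 * i.e. the returned extent runs from off to the end of its stripe unit.
 */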
14642
14643 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
14644 {
14645 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14646 if (!mref_reader.is_state_satisfied())
14647 return -CEPHFS_ENOTCONN;
14648
14649 std::scoped_lock lock(client_lock);
14650
14651 if (id < 0)
14652 return -CEPHFS_EINVAL;
14653 return objecter->with_osdmap([&](const OSDMap& o) {
14654 return o.crush->get_full_location_ordered(id, path);
14655 });
14656 }
14657
14658 int Client::get_file_stripe_address(int fd, loff_t offset,
14659 vector<entity_addr_t>& address)
14660 {
14661 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14662 if (!mref_reader.is_state_satisfied())
14663 return -CEPHFS_ENOTCONN;
14664
14665 std::scoped_lock lock(client_lock);
14666
14667 Fh *f = get_filehandle(fd);
14668 if (!f)
14669 return -CEPHFS_EBADF;
14670 Inode *in = f->inode.get();
14671
14672 // which object?
14673 vector<ObjectExtent> extents;
14674 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
14675 in->truncate_size, extents);
14676 ceph_assert(extents.size() == 1);
14677
14678 // now we have the object and its 'layout'
14679 return objecter->with_osdmap([&](const OSDMap& o) {
14680 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
14681 vector<int> osds;
14682 o.pg_to_acting_osds(pg, osds);
14683 if (osds.empty())
14684 return -CEPHFS_EINVAL;
14685 for (unsigned i = 0; i < osds.size(); i++) {
14686 entity_addr_t addr = o.get_addrs(osds[i]).front();
14687 address.push_back(addr);
14688 }
14689 return 0;
14690 });
14691 }
14692
14693 int Client::get_osd_addr(int osd, entity_addr_t& addr)
14694 {
14695 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14696 if (!mref_reader.is_state_satisfied())
14697 return -CEPHFS_ENOTCONN;
14698
14699 std::scoped_lock lock(client_lock);
14700
14701 return objecter->with_osdmap([&](const OSDMap& o) {
14702 if (!o.exists(osd))
14703 return -CEPHFS_ENOENT;
14704
14705 addr = o.get_addrs(osd).front();
14706 return 0;
14707 });
14708 }
14709
14710 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
14711 loff_t length, loff_t offset)
14712 {
14713 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14714 if (!mref_reader.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN;
14716
14717 std::scoped_lock lock(client_lock);
14718
14719 Fh *f = get_filehandle(fd);
14720 if (!f)
14721 return -CEPHFS_EBADF;
14722 Inode *in = f->inode.get();
14723
14724 // map to a list of extents
14725 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
14726
14727 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
14728 return 0;
14729 }
14730
14731
14732 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
14733 int Client::get_local_osd()
14734 {
14735 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14736 if (!mref_reader.is_state_satisfied())
14737 return -CEPHFS_ENOTCONN;
14738
14739 std::scoped_lock lock(client_lock);
14740
14741 objecter->with_osdmap([this](const OSDMap& o) {
14742 if (o.get_epoch() != local_osd_epoch) {
14743 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
14744 local_osd_epoch = o.get_epoch();
14745 }
14746 });
14747 return local_osd;
14748 }
14749
14750
14751
14752
14753
14754
14755 // ===============================
14756
14757 void Client::ms_handle_connect(Connection *con)
14758 {
14759 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
14760 }
14761
14762 bool Client::ms_handle_reset(Connection *con)
14763 {
14764 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14765 return false;
14766 }
14767
14768 void Client::ms_handle_remote_reset(Connection *con)
14769 {
14770 std::scoped_lock lock(client_lock);
14771 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
14772 switch (con->get_peer_type()) {
14773 case CEPH_ENTITY_TYPE_MDS:
14774 {
14775 // kludge to figure out which mds this is; fixme with a Connection* state
14776 mds_rank_t mds = MDS_RANK_NONE;
14777 MetaSession *s = NULL;
14778 for (auto &p : mds_sessions) {
14779 if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
14780 mds = p.first;
14781 s = &p.second;
14782 }
14783 }
14784 if (mds >= 0) {
      ceph_assert(s != NULL);
14786 switch (s->state) {
14787 case MetaSession::STATE_CLOSING:
14788 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
14789 _closed_mds_session(s);
14790 break;
14791
14792 case MetaSession::STATE_OPENING:
14793 {
14794 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
14795 list<Context*> waiters;
14796 waiters.swap(s->waiting_for_open);
14797 _closed_mds_session(s);
14798 MetaSession *news = _get_or_open_mds_session(mds);
14799 news->waiting_for_open.swap(waiters);
14800 }
14801 break;
14802
14803 case MetaSession::STATE_OPEN:
14804 {
14805 objecter->maybe_request_map(); /* to check if we are blocklisted */
14806 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
14807 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
14808 _closed_mds_session(s);
14809 } else {
14810 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
14811 s->state = MetaSession::STATE_STALE;
14812 }
14813 }
14814 break;
14815
14816 case MetaSession::STATE_NEW:
14817 case MetaSession::STATE_CLOSED:
14818 default:
14819 break;
14820 }
14821 }
14822 }
14823 break;
14824 }
14825 }
14826
14827 bool Client::ms_handle_refused(Connection *con)
14828 {
14829 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
14830 return false;
14831 }
14832
14833 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14834 {
14835 Inode *quota_in = root_ancestor;
14836 SnapRealm *realm = in->snaprealm;
14837 while (realm) {
14838 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14839 if (realm->ino != in->ino) {
14840 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14841 if (p == inode_map.end())
14842 break;
14843
14844 if (p->second->quota.is_enable()) {
14845 quota_in = p->second;
14846 break;
14847 }
14848 }
14849 realm = realm->pparent;
14850 }
14851 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14852 return quota_in;
14853 }
14854
14855 /**
14856 * Traverse the quota ancestors of the Inode, returning true
14857 * if any of them satisfies the given predicate
14858 */
14859 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14860 std::function<bool (const Inode &in)> test)
14861 {
14862 while (true) {
14863 ceph_assert(in != NULL);
14864 if (test(*in)) {
14865 return true;
14866 }
14867
14868 if (in == root_ancestor) {
14869 // We're done traversing, drop out
14870 return false;
14871 } else {
14872 // Continue up the tree
14873 in = get_quota_root(in, perms);
14874 }
14875 }
14876
14877 return false;
14878 }
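// The is_quota_* helpers below are the in-tree callers; an equivalent
// ad-hoc query (illustrative only) for "does any quota ancestor set a
// byte limit?" would be:
//
//   bool limited = check_quota_condition(in, perms,
//       [](const Inode &i) { return i.quota.max_bytes > 0; });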
14879
14880 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14881 {
14882 return check_quota_condition(in, perms,
14883 [](const Inode &in) {
14884 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14885 });
14886 }
14887
14888 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
14889 const UserPerm& perms)
14890 {
14891 return check_quota_condition(in, perms,
14892 [&new_bytes](const Inode &in) {
14893 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14894 > in.quota.max_bytes;
14895 });
14896 }
14897
14898 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
14899 {
14900 ceph_assert(in->size >= in->reported_size);
14901 const uint64_t size = in->size - in->reported_size;
14902 return check_quota_condition(in, perms,
14903 [&size](const Inode &in) {
14904 if (in.quota.max_bytes) {
14905 if (in.rstat.rbytes >= in.quota.max_bytes) {
14906 return true;
14907 }
14908
14909 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14910 return (space >> 4) < size;
14911 } else {
14912 return false;
14913 }
14914 });
14915 }
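// Worked example of the (space >> 4) heuristic above (illustrative numbers):
// with quota.max_bytes = 1 GiB and rstat.rbytes = 1008 MiB there are 16 MiB
// of headroom left, so this returns true once the file's unreported growth
// (size - reported_size) exceeds 16 MiB / 16 = 1 MiB, i.e. once pending
// writes could consume more than 1/16 of the remaining quota.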
14916
14917 enum {
14918 POOL_CHECKED = 1,
14919 POOL_CHECKING = 2,
14920 POOL_READ = 4,
14921 POOL_WRITE = 8,
14922 };
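// Values cached in pool_perms are bitmasks of these flags; for example
// (POOL_CHECKED | POOL_READ) records a finished probe that granted read
// but not write access to the pool.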
14923
14924 int Client::check_pool_perm(Inode *in, int need)
14925 {
14926 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14927
14928 if (!cct->_conf->client_check_pool_perm)
14929 return 0;
14930
14931 /* Only need to do this for regular files */
14932 if (!in->is_file())
14933 return 0;
14934
14935 int64_t pool_id = in->layout.pool_id;
14936 std::string pool_ns = in->layout.pool_ns;
14937 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
14938 int have = 0;
14939 while (true) {
14940 auto it = pool_perms.find(perm_key);
14941 if (it == pool_perms.end())
14942 break;
14943 if (it->second == POOL_CHECKING) {
14944 // avoid concurrent checks of the same pool
14945 wait_on_list(waiting_for_pool_perm);
14946 } else {
14947 have = it->second;
14948 ceph_assert(have & POOL_CHECKED);
14949 break;
14950 }
14951 }
14952
14953 if (!have) {
14954 if (in->snapid != CEPH_NOSNAP) {
14955 // The pool permission check needs to write to the first object. But for a
14956 // snapshot, the head of the first object may have already been deleted. To
14957 // avoid creating an orphan object, skip the check for now.
14958 return 0;
14959 }
14960
14961 pool_perms[perm_key] = POOL_CHECKING;
14962
14963 char oid_buf[32];
14964 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
14965 object_t oid = oid_buf;
14966
14967 SnapContext nullsnapc;
14968
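// Probe read access with a stat of the file's first object, and write
// access with an exclusive create of the same object. As handled below,
// -CEPHFS_ENOENT and -CEPHFS_EEXIST replies still count as permission
// granted: the OSD accepted the op, the object state just differed.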
14969 C_SaferCond rd_cond;
14970 ObjectOperation rd_op;
14971 rd_op.stat(nullptr, nullptr, nullptr);
14972
14973 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
14974 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
14975
14976 C_SaferCond wr_cond;
14977 ObjectOperation wr_op;
14978 wr_op.create(true);
14979
14980 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
14981 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
14982
14983 client_lock.unlock();
14984 int rd_ret = rd_cond.wait();
14985 int wr_ret = wr_cond.wait();
14986 client_lock.lock();
14987
14988 bool errored = false;
14989
14990 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
14991 have |= POOL_READ;
14992 else if (rd_ret != -CEPHFS_EPERM) {
14993 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
14994 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
14995 errored = true;
14996 }
14997
14998 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
14999 have |= POOL_WRITE;
15000 else if (wr_ret != -CEPHFS_EPERM) {
15001 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15002 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15003 errored = true;
15004 }
15005
15006 if (errored) {
15007 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15008 // Raise EIO because the actual error code might be misleading to a
15009 // userspace filesystem user.
15010 pool_perms.erase(perm_key);
15011 signal_cond_list(waiting_for_pool_perm);
15012 return -CEPHFS_EIO;
15013 }
15014
15015 pool_perms[perm_key] = have | POOL_CHECKED;
15016 signal_cond_list(waiting_for_pool_perm);
15017 }
15018
15019 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
15020 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15021 << " need " << ccap_string(need) << ", but no read perm" << dendl;
15022 return -CEPHFS_EPERM;
15023 }
15024 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
15025 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
15026 << " need " << ccap_string(need) << ", but no write perm" << dendl;
15027 return -CEPHFS_EPERM;
15028 }
15029
15030 return 0;
15031 }
15032
15033 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15034 {
15035 if (acl_type == POSIX_ACL) {
15036 if (in->xattrs.count(ACL_EA_ACCESS)) {
15037 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15038
15039 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15040 }
15041 }
15042 return -CEPHFS_EAGAIN;
15043 }
15044
15045 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15046 {
15047 if (acl_type == NO_ACL)
15048 return 0;
15049
15050 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15051 if (r < 0)
15052 goto out;
15053
15054 if (acl_type == POSIX_ACL) {
15055 if (in->xattrs.count(ACL_EA_ACCESS)) {
15056 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15057 bufferptr acl(access_acl.c_str(), access_acl.length());
15058 r = posix_acl_access_chmod(acl, mode);
15059 if (r < 0)
15060 goto out;
15061 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15062 } else {
15063 r = 0;
15064 }
15065 }
15066 out:
15067 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15068 return r;
15069 }
15070
15071 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15072 const UserPerm& perms)
15073 {
15074 if (acl_type == NO_ACL)
15075 return 0;
15076
15077 if (S_ISLNK(*mode))
15078 return 0;
15079
15080 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15081 if (r < 0)
15082 goto out;
15083
15084 if (acl_type == POSIX_ACL) {
15085 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15086 map<string, bufferptr> xattrs;
15087
15088 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15089 bufferptr acl(default_acl.c_str(), default_acl.length());
15090 r = posix_acl_inherit_mode(acl, mode);
15091 if (r < 0)
15092 goto out;
15093
15094 if (r > 0) {
15095 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15096 if (r < 0)
15097 goto out;
15098 if (r > 0)
15099 xattrs[ACL_EA_ACCESS] = acl;
15100 }
15101
15102 if (S_ISDIR(*mode))
15103 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15104
15105 r = xattrs.size();
15106 if (r > 0)
15107 encode(xattrs, xattrs_bl);
15108 } else {
15109 if (umask_cb)
15110 *mode &= ~umask_cb(callback_handle);
15111 r = 0;
15112 }
15113 }
15114 out:
15115 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15116 return r;
15117 }
15118
15119 void Client::set_filer_flags(int flags)
15120 {
15121 std::scoped_lock l(client_lock);
15122 ceph_assert(flags == 0 ||
15123 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15124 objecter->add_global_op_flags(flags);
15125 }
15126
15127 void Client::clear_filer_flags(int flags)
15128 {
15129 std::scoped_lock l(client_lock);
15130 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15131 objecter->clear_global_op_flag(flags);
15132 }
15133
15134 // called before mount
15135 void Client::set_uuid(const std::string& uuid)
15136 {
15137 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15138 ceph_assert(iref_reader.is_state_satisfied());
15139
15140 std::scoped_lock l(client_lock);
15141 ceph_assert(!uuid.empty());
15142
15143 metadata["uuid"] = uuid;
15144 _close_sessions();
15145 }
15146
15147 // called before mount; a timeout of 0 means infinite
15148 void Client::set_session_timeout(unsigned timeout)
15149 {
15150 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15151 ceph_assert(iref_reader.is_state_satisfied());
15152
15153 std::scoped_lock l(client_lock);
15154
15155 metadata["timeout"] = stringify(timeout);
15156 }
15157
15158 // called before mount
15159 int Client::start_reclaim(const std::string& uuid, unsigned flags,
15160 const std::string& fs_name)
15161 {
15162 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15163 if (!iref_reader.is_state_satisfied())
15164 return -CEPHFS_ENOTCONN;
15165
15166 if (uuid.empty())
15167 return -CEPHFS_EINVAL;
15168
15169 std::unique_lock l(client_lock);
15170 {
15171 auto it = metadata.find("uuid");
15172 if (it != metadata.end() && it->second == uuid)
15173 return -CEPHFS_EINVAL;
15174 }
15175
15176 int r = subscribe_mdsmap(fs_name);
15177 if (r < 0) {
15178 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15179 return r;
15180 }
15181
15182 if (metadata.empty())
15183 populate_metadata("");
15184
15185 while (mdsmap->get_epoch() == 0)
15186 wait_on_list(waiting_for_mdsmap);
15187
15188 reclaim_errno = 0;
15189 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15190 if (!mdsmap->is_up(mds)) {
15191 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15192 wait_on_list(waiting_for_mdsmap);
15193 continue;
15194 }
15195
15196 MetaSession *session;
15197 if (!have_open_session(mds)) {
15198 session = _get_or_open_mds_session(mds);
15199 if (session->state == MetaSession::STATE_REJECTED)
15200 return -CEPHFS_EPERM;
15201 if (session->state != MetaSession::STATE_OPENING) {
15202 // unmounting?
15203 return -CEPHFS_EINVAL;
15204 }
15205 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
15206 wait_on_context_list(session->waiting_for_open);
15207 continue;
15208 }
15209
15210 session = &mds_sessions.at(mds);
15211 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
15212 return -CEPHFS_EOPNOTSUPP;
15213
15214 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
15215 session->reclaim_state == MetaSession::RECLAIMING) {
15216 session->reclaim_state = MetaSession::RECLAIMING;
15217 auto m = make_message<MClientReclaim>(uuid, flags);
15218 session->con->send_message2(std::move(m));
15219 wait_on_list(waiting_for_reclaim);
15220 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
15221 return reclaim_errno ? reclaim_errno : -CEPHFS_ENOTRECOVERABLE;
15222 } else {
15223 mds++;
15224 }
15225 }
15226
15227 // didn't find the target session on any MDS
15228 if (reclaim_target_addrs.empty()) {
15229 if (flags & CEPH_RECLAIM_RESET)
15230 return -CEPHFS_ENOENT;
15231 return -CEPHFS_ENOTRECOVERABLE;
15232 }
15233
15234 if (flags & CEPH_RECLAIM_RESET)
15235 return 0;
15236
15237 // use blocklist to check if target session was killed
15238 // (config option mds_session_blocklist_on_evict needs to be true)
15239 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
15240 bs::error_code ec;
15241 l.unlock();
15242 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
15243 l.lock();
15244
15245 if (ec)
15246 return ceph::from_error_code(ec);
15247
15248 bool blocklisted = objecter->with_osdmap(
15249 [this](const OSDMap &osd_map) -> bool {
15250 return osd_map.is_blocklisted(reclaim_target_addrs);
15251 });
15252 if (blocklisted)
15253 return -CEPHFS_ENOTRECOVERABLE;
15254
15255 metadata["reclaiming_uuid"] = uuid;
15256 return 0;
15257 }
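// Illustrative takeover sequence (a sketch with hypothetical names; all of
// these calls must happen before mount):
//
//   client->set_session_timeout(60);
//   int r = client->start_reclaim(old_uuid, 0, fs_name);
//   if (r == 0)
//     client->finish_reclaim();  // adopts old_uuid and notifies the MDSs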
15258
15259 void Client::finish_reclaim()
15260 {
15261 auto it = metadata.find("reclaiming_uuid");
15262 if (it == metadata.end()) {
15263 for (auto &p : mds_sessions)
15264 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15265 return;
15266 }
15267
15268 for (auto &p : mds_sessions) {
15269 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
15270 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
15271 p.second.con->send_message2(std::move(m));
15272 }
15273
15274 metadata["uuid"] = it->second;
15275 metadata.erase(it);
15276 }
15277
15278 void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15279 {
15280 mds_rank_t from = mds_rank_t(reply->get_source().num());
15281 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15282
15283 std::scoped_lock cl(client_lock);
15284 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
15285 if (!session) {
15286 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15287 return;
15288 }
15289
15290 if (reply->get_result() >= 0) {
15291 session->reclaim_state = MetaSession::RECLAIM_OK;
15292 if (reply->get_epoch() > reclaim_osd_epoch)
15293 reclaim_osd_epoch = reply->get_epoch();
15294 if (!reply->get_addrs().empty())
15295 reclaim_target_addrs = reply->get_addrs();
15296 } else {
15297 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15298 reclaim_errno = reply->get_result();
15299 }
15300
15301 signal_cond_list(waiting_for_reclaim);
15302 }
15303
15304 /**
15305 * This is included in cap release messages, to cause
15306 * the MDS to wait until this OSD map epoch. It is necessary
15307 * in corner cases where we cancel RADOS ops, so that
15308 * nobody else tries to do IO to the same objects in
15309 * the same epoch as the cancelled ops.
15310 */
15311 void Client::set_cap_epoch_barrier(epoch_t e)
15312 {
15313 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
15314 cap_epoch_barrier = e;
15315 }
15316
15317 const char** Client::get_tracked_conf_keys() const
15318 {
15319 static const char* keys[] = {
15320 "client_cache_size",
15321 "client_cache_mid",
15322 "client_acl_type",
15323 "client_deleg_timeout",
15324 "client_deleg_break_on_open",
15325 "client_oc_size",
15326 "client_oc_max_objects",
15327 "client_oc_max_dirty",
15328 "client_oc_target_dirty",
15329 "client_oc_max_dirty_age",
15330 NULL
15331 };
15332 return keys;
15333 }
15334
15335 void Client::handle_conf_change(const ConfigProxy& conf,
15336 const std::set <std::string> &changed)
15337 {
15338 std::scoped_lock lock(client_lock);
15339
15340 if (changed.count("client_cache_mid")) {
15341 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
15342 }
15343 if (changed.count("client_acl_type")) {
15344 acl_type = NO_ACL;
15345 if (cct->_conf->client_acl_type == "posix_acl")
15346 acl_type = POSIX_ACL;
15347 }
15348 if (changed.count("client_oc_size")) {
15349 objectcacher->set_max_size(cct->_conf->client_oc_size);
15350 }
15351 if (changed.count("client_oc_max_objects")) {
15352 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
15353 }
15354 if (changed.count("client_oc_max_dirty")) {
15355 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
15356 }
15357 if (changed.count("client_oc_target_dirty")) {
15358 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
15359 }
15360 if (changed.count("client_oc_max_dirty_age")) {
15361 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
15362 }
15363 }
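// Illustrative runtime override (a sketch; set_val() and apply_changes()
// are the generic ConfigProxy entry points used throughout Ceph):
//
//   cct->_conf.set_val("client_oc_size", "268435456");  // 256 MiB cache
//   cct->_conf.apply_changes(nullptr);  // notifies handle_conf_change()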
15364
15365 void intrusive_ptr_add_ref(Inode *in)
15366 {
15367 in->get();
15368 }
15369
15370 void intrusive_ptr_release(Inode *in)
15371 {
15372 in->client->put_inode(in);
15373 }
15374
15375 mds_rank_t Client::_get_random_up_mds() const
15376 {
15377 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15378
15379 std::set<mds_rank_t> up;
15380 mdsmap->get_up_mds_set(up);
15381
15382 if (up.empty())
15383 return MDS_RANK_NONE;
15384 std::set<mds_rank_t>::const_iterator p = up.begin();
15385 for (int n = rand() % up.size(); n; n--)
15386 ++p;
15387 return *p;
15388 }
15389
15390
15391 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
15392 boost::asio::io_context& ictx)
15393 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
15394 {
15395 monclient->set_messenger(m);
15396 objecter->set_client_incarnation(0);
15397 }
15398
15399 StandaloneClient::~StandaloneClient()
15400 {
15401 delete objecter;
15402 objecter = nullptr;
15403 }
15404
15405 int StandaloneClient::init()
15406 {
15407 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
15408 ceph_assert(iref_writer.is_first_writer());
15409
15410 _pre_init();
15411 objecter->init();
15412
15413 client_lock.lock();
15414
15415 messenger->add_dispatcher_tail(objecter);
15416 messenger->add_dispatcher_tail(this);
15417
15418 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
15419 int r = monclient->init();
15420 if (r < 0) {
15421 // need to do cleanup because we're in an intermediate init state
15422 {
15423 std::scoped_lock l(timer_lock);
15424 timer.shutdown();
15425 }
15426
15427 client_lock.unlock();
15428 objecter->shutdown();
15429 objectcacher->stop();
15430 monclient->shutdown();
15431 return r;
15432 }
15433 objecter->start();
15434
15435 client_lock.unlock();
15436 _finish_init();
15437 iref_writer.update_state(CLIENT_INITIALIZED);
15438
15439 return 0;
15440 }
15441
15442 void StandaloneClient::shutdown()
15443 {
15444 Client::shutdown();
15445 objecter->shutdown();
15446 monclient->shutdown();
15447 }