]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update sources to v12.2.5
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <sys/stat.h>
22 #include <sys/param.h>
23 #include <fcntl.h>
24 #include <sys/file.h>
25 #include <sys/utsname.h>
26 #include <sys/uio.h>
27
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
30
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
34 #else
35 #include <sys/xattr.h>
36 #endif
37
38 #if defined(__linux__)
39 #include <linux/falloc.h>
40 #endif
41
42 #include <sys/statvfs.h>
43
44 #include "common/config.h"
45 #include "common/version.h"
46
47 // ceph stuff
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
63
64 #include "mon/MonClient.h"
65
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
69
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
76
77 #define dout_subsys ceph_subsys_client
78
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
82
83 #include "Client.h"
84 #include "Inode.h"
85 #include "Dentry.h"
86 #include "Delegation.h"
87 #include "Dir.h"
88 #include "ClientSnapRealm.h"
89 #include "Fh.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
94
95 #include "include/assert.h"
96 #include "include/stat.h"
97
98 #include "include/cephfs/ceph_statx.h"
99
100 #if HAVE_GETGROUPLIST
101 #include <grp.h>
102 #include <pwd.h>
103 #include <unistd.h>
104 #endif
105
106 #undef dout_prefix
107 #define dout_prefix *_dout << "client." << whoami << " "
108
109 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111 // FreeBSD fails to define this
112 #ifndef O_DSYNC
113 #define O_DSYNC 0x0
114 #endif
115 // Darwin fails to define this
116 #ifndef O_RSYNC
117 #define O_RSYNC 0x0
118 #endif
119
120 #ifndef O_DIRECT
121 #define O_DIRECT 0x0
122 #endif
123
124 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127 {
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130 }
131
132
133 // -------------
134
// Admin-socket command hook.  Stores a back-pointer to the owning Client
// so commands registered in _finish_init() can be dispatched to it in call().
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
139
140 bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142 {
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163 }
164
165
166 // -------------
167
// Per-open-directory read state.  next_offset starts at 2 (offset values
// below 2 are reserved; see the readdir_offset == 2 assertion in
// insert_readdir_results), and the opener's credentials are captured for
// later readdir operations.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
173
174 void Client::_reset_faked_inos()
175 {
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181 }
182
// Allocate the next free faked ino for 'in', scanning forward from the
// last ino handed out and wrapping back to the start of the free set when
// the high end is exhausted.  Records the reverse mapping so
// _map_faked_ino() can translate it back to the real vinodeno.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // nothing free above the cursor: wrap around and rescan from the start
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // the free interval begins above the cursor; take its first ino
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor lies inside the free interval; take the next ino in sequence
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203 void Client::_release_faked_ino(Inode *in)
204 {
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207 }
208
209 vinodeno_t Client::_map_faked_ino(ino_t ino)
210 {
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220 }
221
// Public, locking wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
227
228 // cons/des
229
// Construct a Client bound to the given messenger, monitor client and
// objecter.  Wires up the finishers, timer and the osdc layer (writeback
// handler, object cacher, filer); only the objecter_finisher is started
// here — everything else waits for init()/mount().  Note that client_lock
// is shared with the timer and the object cacher.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    getgroups_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  _reset_faked_inos();
  root = 0;

  num_flushing_caps = 0;

  // precompute the listxattr name-buffer sizes for the virtual xattrs
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out by this allocator
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					        &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
298
299
// Destructor: tears down the metadata cache.  Must be called with
// client_lock NOT held (asserted), since we take it ourselves.
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311
312 void Client::tear_down_cache()
313 {
314 // fd's
315 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
316 it != fd_map.end();
317 ++it) {
318 Fh *fh = it->second;
319 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
320 _release_fh(fh);
321 }
322 fd_map.clear();
323
324 while (!opened_dirs.empty()) {
325 dir_result_t *dirp = *opened_dirs.begin();
326 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
327 _closedir(dirp);
328 }
329
330 // caps!
331 // *** FIXME ***
332
333 // empty lru
334 trim_cache();
335 assert(lru.lru_get_size() == 0);
336
337 // close root ino
338 assert(inode_map.size() <= 1 + root_parents.size());
339 if (root && inode_map.size() == 1 + root_parents.size()) {
340 delete root;
341 root = 0;
342 root_ancestor = 0;
343 while (!root_parents.empty())
344 root_parents.erase(root_parents.begin());
345 inode_map.clear();
346 _reset_faked_inos();
347 }
348
349 assert(inode_map.empty());
350 }
351
352 inodeno_t Client::get_root_ino()
353 {
354 Mutex::Locker l(client_lock);
355 if (use_faked_inos())
356 return root->faked_ino;
357 else
358 return root->ino;
359 }
360
// Take an ll reference on the root inode and return it; the caller is
// responsible for releasing that reference.
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
367
368
369 // debug crapola
370
371 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
372 {
373 filepath path;
374 in->make_long_path(path);
375 ldout(cct, 1) << "dump_inode: "
376 << (disconnected ? "DISCONNECTED ":"")
377 << "inode " << in->ino
378 << " " << path
379 << " ref " << in->get_num_ref()
380 << *in << dendl;
381
382 if (f) {
383 f->open_object_section("inode");
384 f->dump_stream("path") << path;
385 if (disconnected)
386 f->dump_int("disconnected", 1);
387 in->dump(f);
388 f->close_section();
389 }
390
391 did.insert(in);
392 if (in->dir) {
393 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
394 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
395 it != in->dir->dentries.end();
396 ++it) {
397 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
398 if (f) {
399 f->open_object_section("dentry");
400 it->second->dump(f);
401 f->close_section();
402 }
403 if (it->second->inode)
404 dump_inode(f, it->second->inode.get(), did, false);
405 }
406 }
407 }
408
409 void Client::dump_cache(Formatter *f)
410 {
411 set<Inode*> did;
412
413 ldout(cct, 1) << "dump_cache" << dendl;
414
415 if (f)
416 f->open_array_section("cache");
417
418 if (root)
419 dump_inode(f, root, did, true);
420
421 // make a second pass to catch anything disconnected
422 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
423 it != inode_map.end();
424 ++it) {
425 if (did.count(it->second))
426 continue;
427 dump_inode(f, it->second, did, true);
428 }
429
430 if (f)
431 f->close_section();
432 }
433
// Dump client metadata, cache counters and map epochs to the formatter.
// Caller must hold client_lock (asserted below).
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
  }
}
458
// Start the timer and object cacher, hook this dispatcher into the
// messenger, then complete initialization (perf counters, admin-socket
// commands) in _finish_init().  Returns 0.
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
473
474 void Client::_finish_init()
475 {
476 client_lock.Lock();
477 // logger
478 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
479 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
480 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
481 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
482 logger.reset(plb.create_perf_counters());
483 cct->get_perfcounters_collection()->add(logger.get());
484
485 client_lock.Unlock();
486
487 cct->_conf->add_observer(this);
488
489 AdminSocket* admin_socket = cct->get_admin_socket();
490 int ret = admin_socket->register_command("mds_requests",
491 "mds_requests",
492 &m_command_hook,
493 "show in-progress mds requests");
494 if (ret < 0) {
495 lderr(cct) << "error registering admin socket command: "
496 << cpp_strerror(-ret) << dendl;
497 }
498 ret = admin_socket->register_command("mds_sessions",
499 "mds_sessions",
500 &m_command_hook,
501 "show mds session state");
502 if (ret < 0) {
503 lderr(cct) << "error registering admin socket command: "
504 << cpp_strerror(-ret) << dendl;
505 }
506 ret = admin_socket->register_command("dump_cache",
507 "dump_cache",
508 &m_command_hook,
509 "show in-memory metadata cache contents");
510 if (ret < 0) {
511 lderr(cct) << "error registering admin socket command: "
512 << cpp_strerror(-ret) << dendl;
513 }
514 ret = admin_socket->register_command("kick_stale_sessions",
515 "kick_stale_sessions",
516 &m_command_hook,
517 "kick sessions that were remote reset");
518 if (ret < 0) {
519 lderr(cct) << "error registering admin socket command: "
520 << cpp_strerror(-ret) << dendl;
521 }
522 ret = admin_socket->register_command("status",
523 "status",
524 &m_command_hook,
525 "show overall client status");
526 if (ret < 0) {
527 lderr(cct) << "error registering admin socket command: "
528 << cpp_strerror(-ret) << dendl;
529 }
530
531 client_lock.Lock();
532 initialized = true;
533 client_lock.Unlock();
534 }
535
536 void Client::shutdown()
537 {
538 ldout(cct, 1) << "shutdown" << dendl;
539
540 // If we were not mounted, but were being used for sending
541 // MDS commands, we may have sessions that need closing.
542 client_lock.Lock();
543 _close_sessions();
544 client_lock.Unlock();
545
546 cct->_conf->remove_observer(this);
547
548 AdminSocket* admin_socket = cct->get_admin_socket();
549 admin_socket->unregister_command("mds_requests");
550 admin_socket->unregister_command("mds_sessions");
551 admin_socket->unregister_command("dump_cache");
552 admin_socket->unregister_command("kick_stale_sessions");
553 admin_socket->unregister_command("status");
554
555 if (ino_invalidate_cb) {
556 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
557 async_ino_invalidator.wait_for_empty();
558 async_ino_invalidator.stop();
559 }
560
561 if (dentry_invalidate_cb) {
562 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
563 async_dentry_invalidator.wait_for_empty();
564 async_dentry_invalidator.stop();
565 }
566
567 if (switch_interrupt_cb) {
568 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
569 interrupt_finisher.wait_for_empty();
570 interrupt_finisher.stop();
571 }
572
573 if (remount_cb) {
574 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
575 remount_finisher.wait_for_empty();
576 remount_finisher.stop();
577 }
578
579 objectcacher->stop(); // outside of client_lock! this does a join.
580
581 client_lock.Lock();
582 assert(initialized);
583 initialized = false;
584 timer.shutdown();
585 client_lock.Unlock();
586
587 objecter_finisher.wait_for_empty();
588 objecter_finisher.stop();
589
590 if (logger) {
591 cct->get_perfcounters_collection()->remove(logger.get());
592 logger.reset();
593 }
594 }
595
596
597 // ===================
598 // metadata cache stuff
599
// Trim dentries from the LRU until the cache is within client_cache_size
// (while unmounting, trim everything that can be expired).  Optionally
// asks the kernel to drop its dcache entries when we are still over
// budget, and drops the root inode chain once the cache is fully empty.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a full pass makes no progress (size unchanged)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  // still over budget: the remaining dentries are pinned; ask the kernel
  // to release what it can
  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
633
634 void Client::trim_cache_for_reconnect(MetaSession *s)
635 {
636 mds_rank_t mds = s->mds_num;
637 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
638
639 int trimmed = 0;
640 list<Dentry*> skipped;
641 while (lru.lru_get_size() > 0) {
642 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
643 if (!dn)
644 break;
645
646 if ((dn->inode && dn->inode->caps.count(mds)) ||
647 dn->dir->parent_inode->caps.count(mds)) {
648 trim_dentry(dn);
649 trimmed++;
650 } else
651 skipped.push_back(dn);
652 }
653
654 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
655 lru.lru_insert_mid(*p);
656
657 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
658 << " trimmed " << trimmed << " dentries" << dendl;
659
660 if (s->caps.size() > 0)
661 _invalidate_kernel_dcache();
662 }
663
// Unlink one dentry from the cache.  If it points at an inode, the parent
// directory's cached listing is no longer complete, so bump its release
// count and clear I_COMPLETE/I_DIR_ORDERED.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir " << hex << dn->dir->parent_inode->ino
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
676
677
// Merge file metadata (size, truncation, change_attr, times, inline data)
// from an MDS message into the inode.  'issued' is the set of caps we hold
// (including dirty and implemented caps): when we hold EXCL/WR/BUFFER caps
// our local values may be newer than the MDS's, so mtime/atime/ctime are
// only taken according to time_warp_seq ordering.
void Client::update_inode_file_bits(Inode *in,
				    uint64_t truncate_seq, uint64_t truncate_size,
				    uint64_t size, uint64_t change_attr,
				    uint64_t time_warp_seq, utime_t ctime,
				    utime_t mtime,
				    utime_t atime,
				    version_t inline_version,
				    bufferlist& inline_data,
				    int issued)
{
  bool warn = false;
  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
		 << " mtime " << mtime << dendl;
  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq <<  " local "
		 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
		 << " local " << in->time_warp_seq << dendl;
  uint64_t prior_size = in->size;

  if (inline_version > in->inline_version) {
    in->inline_data = inline_data;
    in->inline_version = inline_version;
  }

  /* always take a newer change attr */
  if (change_attr > in->change_attr)
    in->change_attr = change_attr;

  // take the MDS's size when it has a newer truncate epoch, or a larger
  // size within the same epoch
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }

  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // we hold caps that let us modify these fields locally, so only take
    // the MDS's times when its time_warp_seq says they are authoritative
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
		     << " is higher than local time_warp_seq "
		     << in->time_warp_seq << dendl;
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // the MDS's time_warp_seq went backwards without us holding EXCL;
    // this is unexpected, so log it loudly
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
783
784 void Client::_fragmap_remove_non_leaves(Inode *in)
785 {
786 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
787 if (!in->dirfragtree.is_leaf(p->first))
788 in->fragmap.erase(p++);
789 else
790 ++p;
791 }
792
793 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794 {
795 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
796 if (p->second == mds)
797 in->fragmap.erase(p++);
798 else
799 ++p;
800 }
801
// Create or refresh the cached Inode for an InodeStat from an MDS reply,
// then merge in the issued cap.  Mutable fields are only updated when the
// reply's version is strictly newer than the local (projection-masked)
// version; fields covered by caps we hold exclusively are never clobbered.
// Returns the (possibly newly created) inode.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // the first inode we ever cache becomes the root
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // before mount completes, each new inode extends the chain of
      // the root's recorded ancestors
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  if (was_new)
    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool updating_inode = false;
  int issued = 0;
  if (st->version == 0 ||
      (in->version & ~1) < st->version) {
    updating_inode = true;

    // fields covered by caps we hold (issued, implemented, or dirty) must
    // not be overwritten by the MDS's possibly-stale copy
    int implemented = 0;
    issued = in->caps_issued(&implemented) | in->caps_dirty();
    issued |= implemented;

    in->version = st->version;

    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
      in->mode = st->mode;
      in->uid = st->uid;
      in->gid = st->gid;
      in->btime = st->btime;
    }

    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
      in->nlink = st->nlink;
    }

    in->dirstat = st->dirstat;
    in->rstat = st->rstat;
    in->quota = st->quota;
    in->layout = st->layout;

    if (in->is_dir()) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
    }

    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
			   st->change_attr, st->time_warp_seq, st->ctime,
			   st->mtime, st->atime, st->inline_version,
			   st->inline_data, issued);
  } else if (st->inline_version > in->inline_version) {
    // even when the inode itself is not newer, newer inline data is taken
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  // move me if/when version reflects fragtree changes.
  if (in->dirfragtree != st->dirfragtree) {
    in->dirfragtree = st->dirfragtree;
    _fragmap_remove_non_leaves(in);
  }

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    // max_size is only meaningful when this session holds the auth cap
    if (in->auth_cap && in->auth_cap->session == session)
      in->max_size = st->max_size;
  } else
    in->snap_caps |= st->cap.caps;

  // setting I_COMPLETE needs to happen after adding the cap
  if (updating_inode &&
      in->is_dir() &&
      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
      in->dirstat.nfiles == 0 &&
      in->dirstat.nsubdirs == 0) {
    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
    in->flags |= I_COMPLETE | I_DIR_ORDERED;
    if (in->dir) {
      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
      in->dir->readdir_cache.clear();
      for (auto p = in->dir->dentries.begin();
	   p != in->dir->dentries.end();
	   ++p) {
	unlink(p->second, true, true);  // keep dir, keep dentry
      }
      if (in->dir->dentries.empty())
	close_dir(in->dir);
    }
  }

  return in;
}
935
936
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  Reuses an existing same-named dentry when it already
 * points at the right inode; otherwise unlinks the stale dentry (and the
 * optional old_dentry, e.g. the source of a rename) and links a fresh one.
 * Always refreshes the dentry lease from the MDS reply.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // stale dentry pointing at a different inode: detach it
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // hold a temporary ref so the inode can't go away across the relink
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// moving out of a different directory breaks that dir's ordering
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
985
// Apply a dentry lease from an MDS reply: if the lease covers the dentry
// lock and extends beyond the current ttl, record the new ttl, issuing
// MDS, lease seq and cap generation.  Always syncs the dentry's
// cap_shared_gen with its parent directory's shared_gen.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  // lease expiry is the request send time plus the granted duration
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
1005
1006
/*
 * update MDS location cache for a single inode: record (or erase) the
 * frag -> auth-mds mapping from a DirStat, force the frag to a leaf of
 * the fragtree if needed, and note whether the dirfrag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth: a negative auth rank means "unknown", so drop the cached entry
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041
1042 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043 {
1044 if (diri->flags & I_COMPLETE) {
1045 if (complete) {
1046 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1047 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 } else {
1049 if (diri->flags & I_DIR_ORDERED) {
1050 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1051 diri->flags &= ~I_DIR_ORDERED;
1052 }
1053 }
1054 if (diri->dir)
1055 diri->dir->readdir_cache.clear();
1056 }
1057 }
1058
1059 /*
1060 * insert results from readdir or lssnap into the metadata cache.
1061 */
// Decode the extra bufferlist of a readdir/lssnap reply and fold the
// returned dentries/inodes into the metadata cache and into the
// dir_result_t buffer the caller is iterating with.  Maintains the
// per-dirp offset bookkeeping for both frag-ordered and hash-ordered
// readdir, and opportunistically fills the shared readdir cache.
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 marks the start of a frag (0/1 are reserved for . and ..)
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may have split/merged frags since we sent the request;
    // adopt the frag it actually answered for
    // NOTE(review): log prefix says "insert_trace" but we are in
    // insert_readdir_results — historical copy/paste in the message.
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // only a readdir that starts at the very beginning of the directory
    // can hope to observe it completely; arm the completeness counters
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        // within a hash-ordered listing, offsets restart at 2 whenever
        // the dentry-name hash value changes
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where to resume from on the next readdir request
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1214
/** insert_trace
 *
 * insert a trace from a MDS reply into the cache.
 *
 * A "trace" is the metadata the MDS attaches to a reply: optionally a
 * dentry record (parent dir inode stat + dentry name/lease) and
 * optionally the target inode stat itself.  This folds whatever the
 * reply carries into the local cache, handles traceless replies by
 * invalidating the affected directory, and dispatches readdir/lssnap
 * payloads to insert_readdir_results().
 *
 * Returns the target inode (also stashed in request->target), or NULL
 * when there is no trace or the unsafe reply was already applied.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the unsafe reply's trace was already applied; the safe reply adds nothing
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    // traceless reply: our cached view of the affected directory is now stale
    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: if we asked for xattrs, the MDS must have sent them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry; drop any stale
      // link and (if the lease is usable) cache the null dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1384
1385 // -------
1386
/**
 * Pick the MDS rank a request should be sent to.
 *
 * Preference order: an explicit resend_mds on the request; a rank
 * derived from the request's inode/dentry (dirfrag -> mds via the
 * inode's fragmap, else the mds we hold caps from); otherwise a
 * random active mds.
 *
 * @param req the request being routed
 * @param phash_diri [out, optional] set to the directory inode whose
 *        fragmap supplied the rank when hash-based routing was used
 * @return the chosen mds rank
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // hash the first path component relative to the inode
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: route by the dentry name's hash in its parent
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped metadata is served by the mds of the nearest non-snapped
      // ancestor; walk up until we find one (or hit an unlinked inode)
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      // map hash -> dirfrag -> mds rank that holds that frag
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // fall back to whatever mds issued us caps on this inode
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1486
1487
1488 void Client::connect_mds_targets(mds_rank_t mds)
1489 {
1490 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1491 assert(mds_sessions.count(mds));
1492 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1493 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1494 q != info.export_targets.end();
1495 ++q) {
1496 if (mds_sessions.count(*q) == 0 &&
1497 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1498 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1499 << " export target mds." << *q << dendl;
1500 _open_mds_session(*q);
1501 }
1502 }
1503 }
1504
1505 void Client::dump_mds_sessions(Formatter *f)
1506 {
1507 f->dump_int("id", get_nodeid().v);
1508 f->open_array_section("sessions");
1509 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1510 f->open_object_section("session");
1511 p->second->dump(f);
1512 f->close_section();
1513 }
1514 f->close_section();
1515 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516 }
1517 void Client::dump_mds_requests(Formatter *f)
1518 {
1519 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1520 p != mds_requests.end();
1521 ++p) {
1522 f->open_object_section("request");
1523 p->second->dump(f);
1524 f->close_section();
1525 }
1526 }
1527
/**
 * Reconcile a reply against the request that produced it and fill in
 * *ptarget / *pcreated for the caller.
 *
 * If the extra bufferlist carries an ino, our create won the race and
 * *pcreated is set.  If the reply had no trace (request->target unset),
 * fall back to a lookup by name (or getattr on the request inode) to
 * find the inode we just created/operated on; returns -EINTR when the
 * inode found that way does not match the ino the create reported.
 *
 * @param r result code from the reply so far
 * @param request the originating request
 * @param reply the MDS reply (its extra_bl is consumed here)
 * @param ptarget [out] resolved target inode
 * @param pcreated [out, optional] whether we actually created the inode
 * @param perms credentials for the fallback lookup/getattr
 */
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // the created inode is already cached; use it directly
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1600
1601
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * Must be called with client_lock held; the lock is dropped while
 * waiting on caller_cond.
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK may block indefinitely, so it is excluded from the
  // oldest-tid accounting we advertise to the MDS
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: pick an mds, ensure a session, send, and wait until we
  // either get a reply or must re-route (forward/kick/mdsmap change)
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the rank we derived no longer exists; drop the stale routing
        // hint and retry
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only get here when the request was aborted before a reply
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1768
1769 void Client::unregister_request(MetaRequest *req)
1770 {
1771 mds_requests.erase(req->tid);
1772 if (req->tid == oldest_tid) {
1773 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 while (true) {
1775 if (p == mds_requests.end()) {
1776 oldest_tid = 0;
1777 break;
1778 }
1779 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1780 oldest_tid = p->first;
1781 break;
1782 }
1783 ++p;
1784 }
1785 }
1786 put_request(req);
1787 }
1788
1789 void Client::put_request(MetaRequest *request)
1790 {
1791 if (request->_put()) {
1792 int op = -1;
1793 if (request->success)
1794 op = request->get_op();
1795 InodeRef other_in;
1796 request->take_other_inode(&other_in);
1797 delete request;
1798
1799 if (other_in &&
1800 (op == CEPH_MDS_OP_RMDIR ||
1801 op == CEPH_MDS_OP_RENAME ||
1802 op == CEPH_MDS_OP_RMSNAP)) {
1803 _try_to_trim_inode(other_in.get(), false);
1804 }
1805 }
1806 }
1807
1808 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1809 mds_rank_t mds, int drop,
1810 int unless, int force)
1811 {
1812 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1813 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1814 << ", have:" << ", force:" << force << ")" << dendl;
1815 int released = 0;
1816 if (in->caps.count(mds)) {
1817 Cap *caps = in->caps[mds];
1818 drop &= ~(in->dirty_caps | get_caps_used(in));
1819 if ((drop & caps->issued) &&
1820 !(unless & caps->issued)) {
1821 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1822 caps->issued &= ~drop;
1823 caps->implemented &= ~drop;
1824 released = 1;
1825 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1826 } else {
1827 released = force;
1828 }
1829 if (released) {
1830 ceph_mds_request_release rel;
1831 rel.ino = in->ino;
1832 rel.cap_id = caps->cap_id;
1833 rel.seq = caps->seq;
1834 rel.issue_seq = caps->issue_seq;
1835 rel.mseq = caps->mseq;
1836 rel.caps = caps->implemented;
1837 rel.wanted = caps->wanted;
1838 rel.dname_len = 0;
1839 rel.dname_seq = 0;
1840 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1841 }
1842 }
1843 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1844 << released << dendl;
1845 return released;
1846 }
1847
// Append a dentry-lease release for `dn` to the request.  A dentry
// release piggybacks on an inode release for the parent directory
// (force=1 guarantees encode_inode_release appends a record), whose
// dname fields we then fill in via cap_releases.back().
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                           mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
           << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  // only hand back the lease if it was issued by the mds this request
  // is going to
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
           << dn << ")" << dendl;
}
1867
1868
1869 /*
1870 * This requires the MClientRequest *request member to be set.
1871 * It will error out horribly without one.
1872 * Additionally, if you set any *drop member, you'd better have
1873 * set the corresponding dentry!
1874 */
1875 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1876 {
1877 ldout(cct, 20) << "encode_cap_releases enter (req: "
1878 << req << ", mds: " << mds << ")" << dendl;
1879 if (req->inode_drop && req->inode())
1880 encode_inode_release(req->inode(), req,
1881 mds, req->inode_drop,
1882 req->inode_unless);
1883
1884 if (req->old_inode_drop && req->old_inode())
1885 encode_inode_release(req->old_inode(), req,
1886 mds, req->old_inode_drop,
1887 req->old_inode_unless);
1888 if (req->other_inode_drop && req->other_inode())
1889 encode_inode_release(req->other_inode(), req,
1890 mds, req->other_inode_drop,
1891 req->other_inode_unless);
1892
1893 if (req->dentry_drop && req->dentry())
1894 encode_dentry_release(req->dentry(), req,
1895 mds, req->dentry_drop,
1896 req->dentry_unless);
1897
1898 if (req->old_dentry_drop && req->old_dentry())
1899 encode_dentry_release(req->old_dentry(), req,
1900 mds, req->old_dentry_drop,
1901 req->old_dentry_unless);
1902 ldout(cct, 25) << "encode_cap_releases exit (req: "
1903 << req << ", mds " << mds <<dendl;
1904 }
1905
1906 bool Client::have_open_session(mds_rank_t mds)
1907 {
1908 return
1909 mds_sessions.count(mds) &&
1910 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1911 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1912 }
1913
1914 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915 {
1916 if (mds_sessions.count(mds) == 0)
1917 return NULL;
1918 MetaSession *s = mds_sessions[mds];
1919 if (s->con != con)
1920 return NULL;
1921 return s;
1922 }
1923
1924 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925 {
1926 if (mds_sessions.count(mds))
1927 return mds_sessions[mds];
1928 return _open_mds_session(mds);
1929 }
1930
1931 /**
1932 * Populate a map of strings with client-identifying metadata,
1933 * such as the hostname. Call this once at initialization.
1934 */
1935 void Client::populate_metadata(const std::string &mount_root)
1936 {
1937 // Hostname
1938 struct utsname u;
1939 int r = uname(&u);
1940 if (r >= 0) {
1941 metadata["hostname"] = u.nodename;
1942 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 } else {
1944 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1945 }
1946
1947 metadata["pid"] = stringify(getpid());
1948
1949 // Ceph entity id (the '0' in "client.0")
1950 metadata["entity_id"] = cct->_conf->name.get_id();
1951
1952 // Our mount position
1953 if (!mount_root.empty()) {
1954 metadata["root"] = mount_root;
1955 }
1956
1957 // Ceph version
1958 metadata["ceph_version"] = pretty_version_to_str();
1959 metadata["ceph_sha1"] = git_version_to_str();
1960
1961 // Apply any metadata from the user's configured overrides
1962 std::vector<std::string> tokens;
1963 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1964 for (const auto &i : tokens) {
1965 auto eqpos = i.find("=");
1966 // Throw out anything that isn't of the form "<str>=<str>"
1967 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1968 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1969 continue;
1970 }
1971 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1972 }
1973 }
1974
1975 /**
1976 * Optionally add or override client metadata fields.
1977 */
1978 void Client::update_metadata(std::string const &k, std::string const &v)
1979 {
1980 Mutex::Locker l(client_lock);
1981 assert(initialized);
1982
1983 if (metadata.count(k)) {
1984 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1985 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1986 }
1987
1988 metadata[k] = v;
1989 }
1990
1991 MetaSession *Client::_open_mds_session(mds_rank_t mds)
1992 {
1993 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1994 assert(mds_sessions.count(mds) == 0);
1995 MetaSession *session = new MetaSession;
1996 session->mds_num = mds;
1997 session->seq = 0;
1998 session->inst = mdsmap->get_inst(mds);
1999 session->con = messenger->get_connection(session->inst);
2000 session->state = MetaSession::STATE_OPENING;
2001 session->mds_state = MDSMap::STATE_NULL;
2002 mds_sessions[mds] = session;
2003
2004 // Maybe skip sending a request to open if this MDS daemon
2005 // has previously sent us a REJECT.
2006 if (rejected_by_mds.count(mds)) {
2007 if (rejected_by_mds[mds] == session->inst) {
2008 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2009 "because we were rejected" << dendl;
2010 return session;
2011 } else {
2012 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2013 "rejected us, trying with new inst" << dendl;
2014 rejected_by_mds.erase(mds);
2015 }
2016 }
2017
2018 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2019 m->client_meta = metadata;
2020 session->con->send_message(m);
2021 return session;
2022 }
2023
// Ask the MDS to close this session.  We only move to CLOSING here;
// the transition to CLOSED happens when the CEPH_SESSION_CLOSE reply
// arrives (see handle_client_session).
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
2030
// Tear down local state for a session that is now closed.  Order
// matters: mark the connection down, wake anyone waiting on the open,
// drop the session's caps, and fail/redirect its in-flight requests
// before the session object is erased and deleted.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2042
// Dispatch an incoming MClientSession message from an MDS.  Messages
// for ranks we have no (matching) session with are discarded.  The
// message reference is consumed (m->put()) on every path.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  // only accept the message if it arrived on the connection our
  // session is bound to
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, wake waiters, and pre-open
    // sessions to this rank's export targets (unless we're unmounting)
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS is under cache pressure; trim down to the requested cap count
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember the rejecting instance so we don't keep retrying it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2106
2107 bool Client::_any_stale_sessions() const
2108 {
2109 assert(client_lock.is_locked_by_me());
2110
2111 for (const auto &i : mds_sessions) {
2112 if (i.second->state == MetaSession::STATE_STALE) {
2113 return true;
2114 }
2115 }
2116
2117 return false;
2118 }
2119
2120 void Client::_kick_stale_sessions()
2121 {
2122 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123
2124 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2125 p != mds_sessions.end(); ) {
2126 MetaSession *s = p->second;
2127 ++p;
2128 if (s->state == MetaSession::STATE_STALE)
2129 _closed_mds_session(s);
2130 }
2131 }
2132
/**
 * Build and transmit the wire message for a MetaRequest on a session.
 *
 * The message is rebuilt on every (re)send so it reflects the current
 * retry/forward counts.  Replayed (got_unsafe) requests are flagged for
 * the MDS; fresh sends attach cap releases unless drop_cap_releases is
 * set (used before cap reconnect).  Also records bookkeeping needed to
 * match the reply: sent_stamp, target mds, and sent_on_mseq.
 */
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // replay of an op the MDS applied but never committed; pin the
    // created target ino so the replay hits the same inode
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may name a data pool; give the MDS our osdmap epoch so it
    // can validate the pool exists
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // first transmission of this request
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // remember the cap migration seq so an ESTALE reply can tell whether
  // caps have moved since we sent
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2177
/**
 * Construct the wire-format MClientRequest for a MetaRequest.
 *
 * Copies the request head, fills in the filepath(s) -- deriving one
 * from the inode or dentry if none was set -- attaches payload data and
 * supplementary gids, and bumps the retry counter.  Caller owns the
 * returned message.
 */
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// null dentry: path of the parent dir plus the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
			 << " No path, inode, or appropriately-endowed dentry given!"
			 << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		       << " No path, inode, or dentry given!"
		       << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // record this attempt (post-increment) so the MDS can detect retries
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2215
2216
2217
2218 void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219 {
2220 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2221 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2222 if (!session) {
2223 fwd->put();
2224 return;
2225 }
2226 ceph_tid_t tid = fwd->get_tid();
2227
2228 if (mds_requests.count(tid) == 0) {
2229 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2230 fwd->put();
2231 return;
2232 }
2233
2234 MetaRequest *request = mds_requests[tid];
2235 assert(request);
2236
2237 // reset retry counter
2238 request->retry_attempt = 0;
2239
2240 // request not forwarded, or dest mds has no session.
2241 // resend.
2242 ldout(cct, 10) << "handle_client_request tid " << tid
2243 << " fwd " << fwd->get_num_fwd()
2244 << " to mds." << fwd->get_dest_mds()
2245 << ", resending to " << fwd->get_dest_mds()
2246 << dendl;
2247
2248 request->mds = -1;
2249 request->item.remove_myself();
2250 request->num_fwd = fwd->get_num_fwd();
2251 request->resend_mds = fwd->get_dest_mds();
2252 request->caller_cond->Signal();
2253
2254 fwd->put();
2255 }
2256
2257 bool Client::is_dir_operation(MetaRequest *req)
2258 {
2259 int op = req->get_op();
2260 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2261 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2262 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2263 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2264 return true;
2265 return false;
2266 }
2267
/**
 * Handle an MClientReply from an MDS.
 *
 * A request can receive two replies: an early "unsafe" one (the op is
 * applied but not yet journaled) and a later "safe" one (committed).
 * The caller thread is woken exactly once, on the first reply, via a
 * cond-var handshake; the safe reply then tears down request
 * bookkeeping.  ESTALE replies trigger a retry against a better MDS
 * when one is available.  Consumes (puts) the message.
 */
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // only worth resending if we would reach a different MDS, or the
    // same MDS after a cap migration (mseq changed); otherwise we must
    // surface the ESTALE to the caller
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    // track the uncommitted op on the session, its directory, and its
    // target inode so fsync/close can wait for it to become safe
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back: block until the caller has consumed the reply
    // and cleared dispatch_cond
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2370
/**
 * React to a FULL flag on a pool (or, when pool == -1, on the whole
 * cluster): cancel outstanding writes with -ENOSPC and purge un-flushed
 * cached data so it is not re-issued.
 */
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
		<< "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
	&& (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
		    << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);
    }
  }

  // raise the cap epoch barrier so the MDS doesn't see stale cap state
  // from before the cancellations
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2409
/**
 * Handle a new OSDMap.
 *
 * Detects our own blacklisting (aborting all in-flight MDS requests and
 * force-closing sessions), notices when a blacklist entry is lifted,
 * and reacts to cluster-wide or per-pool FULL flags by cancelling
 * outstanding writes with -ENOSPC.  Consumes (puts) the message.
 */
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // abort every in-flight MDS request and wake its caller
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
	 p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
	return o.is_blacklisted(myaddr);});
  }

  if (objecter->osdmap_full_flag()) {
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2487
2488
2489 // ------------------------
2490 // incoming messages
2491
2492
/**
 * Main message dispatch entry point (called by the messenger).
 *
 * Takes client_lock, routes each message type to its handler, and
 * returns false for types we don't consume (so other dispatchers get a
 * chance).  While unmounting, each dispatched message is followed by a
 * cache-trim pass so unmount can make progress as references drain.
 */
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; others belong elsewhere
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    // NOTE(review): this condition is true when the cache total *grew*
    // across trim_cache(), yet the log claims it shrank -- a shrink
    // would leave the new total smaller than `size`.  Looks inverted;
    // confirm intended semantics before changing unmount signaling.
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2570
2571 void Client::handle_fs_map(MFSMap *m)
2572 {
2573 fsmap.reset(new FSMap(m->get_fsmap()));
2574 m->put();
2575
2576 signal_cond_list(waiting_for_fsmap);
2577
2578 monclient->sub_got("fsmap", fsmap->get_epoch());
2579 }
2580
2581 void Client::handle_fs_map_user(MFSMapUser *m)
2582 {
2583 fsmap_user.reset(new FSMapUser);
2584 *fsmap_user = m->get_fsmap();
2585 m->put();
2586
2587 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2588 signal_cond_list(waiting_for_fsmap);
2589 }
2590
/**
 * Handle a new MDSMap epoch.
 *
 * Ignores maps no newer than the one we hold.  Cancels commands aimed
 * at MDS daemons that vanished or went laggy, then walks every session
 * and reacts to rank state changes: mark connections down, trigger
 * reconnect, kick requests/caps when a rank becomes active, or close
 * sessions for ranks removed from the map.  Consumes (puts) the
 * message.
 */
void Client::handle_mds_map(MMDSMap* m)
{
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
		  << " is identical to or older than our "
		  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }  

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  // keep the previous map so each rank's state can be diffed against it
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
	std::ostringstream ss;
	ss << "MDS " << op_mds_gid << " went away";
	*(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
	op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  // erase after the scan above so iteration over commands stays valid
  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;  // advance first: _closed_mds_session() below erases entries

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      // the rank moved to a different daemon/address
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change
    
    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
	// kick new requests
	kick_requests(session);
	kick_flushing_caps(session);
	signal_context_list(session->waiting_for_open);
	kick_maxsize_requests(session);
	wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // rank no longer exists (cluster shrank); drop the session
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2682
/**
 * Send an MClientReconnect to an MDS that entered reconnect state.
 *
 * Trims the cache, resets per-session state (readonly flag, pending
 * cap releases, cap seq), resends unsafe requests, then describes every
 * cap we hold from this MDS (path, wanted/issued bits, file locks,
 * snaprealms) so the recovering MDS can rebuild its session state.
 */
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any batched cap-release message; it is stale now
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(in->caps[mds]->issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      // claim everything we had implemented; the MDS re-issues from this
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // describe each snaprealm only once per reconnect message
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }	
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2758
2759
2760 void Client::kick_requests(MetaSession *session)
2761 {
2762 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2763 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2764 p != mds_requests.end();
2765 ++p) {
2766 MetaRequest *req = p->second;
2767 if (req->got_unsafe)
2768 continue;
2769 if (req->aborted()) {
2770 if (req->caller_cond) {
2771 req->kick = true;
2772 req->caller_cond->Signal();
2773 }
2774 continue;
2775 }
2776 if (req->retry_attempt > 0)
2777 continue; // new requests only
2778 if (req->mds == session->mds_num) {
2779 send_request(p->second, session);
2780 }
2781 }
2782 }
2783
2784 void Client::resend_unsafe_requests(MetaSession *session)
2785 {
2786 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2787 !iter.end();
2788 ++iter)
2789 send_request(*iter, session);
2790
2791 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2792 // process completed requests in clientreplay stage.
2793 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2794 p != mds_requests.end();
2795 ++p) {
2796 MetaRequest *req = p->second;
2797 if (req->got_unsafe)
2798 continue;
2799 if (req->aborted())
2800 continue;
2801 if (req->retry_attempt == 0)
2802 continue; // old requests only
2803 if (req->mds == session->mds_num)
2804 send_request(req, session, true);
2805 }
2806 }
2807
2808 void Client::wait_unsafe_requests()
2809 {
2810 list<MetaRequest*> last_unsafe_reqs;
2811 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2812 p != mds_sessions.end();
2813 ++p) {
2814 MetaSession *s = p->second;
2815 if (!s->unsafe_requests.empty()) {
2816 MetaRequest *req = s->unsafe_requests.back();
2817 req->get();
2818 last_unsafe_reqs.push_back(req);
2819 }
2820 }
2821
2822 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2823 p != last_unsafe_reqs.end();
2824 ++p) {
2825 MetaRequest *req = *p;
2826 if (req->unsafe_item.is_on_list())
2827 wait_on_list(req->waitfor_safe);
2828 put_request(req);
2829 }
2830 }
2831
/**
 * A session is being torn down: wake any caller threads blocked on its
 * requests and drop all request state tied to the session.  Unsafe
 * (uncommitted) requests are forcibly unregistered -- the MDS may or
 * may not have journaled their effects.
 */
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance before unregister_request() can invalidate p
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2858
2859
2860
2861
2862 /************
2863 * leases
2864 */
2865
2866 void Client::got_mds_push(MetaSession *s)
2867 {
2868 s->seq++;
2869 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2870 if (s->state == MetaSession::STATE_CLOSING) {
2871 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2872 }
2873 }
2874
/**
 * Handle a lease revocation from an MDS.
 *
 * Only CEPH_MDS_LEASE_REVOKE is expected.  If we still hold the named
 * dentry's lease, invalidate it; in every case ack back with a
 * CEPH_MDS_LEASE_RELEASE carrying the same seq.  Consumes (puts) the
 * message.
 */
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // the cached dentry lease is no longer valid
  }

 revoke:
  // always ack, even if we no longer held the lease
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2917
/**
 * Drop n references on an inode.  On the last reference: release all
 * caps, verify the object cache holds no unclean data, remove the inode
 * from inode_map (and the faked-ino table), clear root bookkeeping if
 * this was the root, and delete it.
 */
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    // the cache set must be clean by now; unclean data here would mean a
    // reference was dropped while writes were still outstanding
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
	root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2943
/**
 * Destroy an empty Dir object and release its pins.
 * Preconditions: the dir has no dentries left and is still attached to
 * its parent inode.
 */
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put(); // unpin dentry
  
  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}
2958
/**
 * Link an inode into a directory under the given name.
 *
 * Don't call this with in==NULL, use get_or_create for that;
 * leave dn set to its default NULL unless you're trying to add
 * a new inode to a pre-created Dentry.
 *
 * Creates the dentry when none is supplied, takes extra dentry pins for
 * directories (dir and ll_ref), and enforces the single-parent rule for
 * directories by unlinking any previous parent dentry.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra pins keep a directory's dentry alive while its Dir or
      // ll refs are outstanding (dropped again in unlink())
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3011
3012 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3013 {
3014 InodeRef in;
3015 in.swap(dn->inode);
3016 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3017 << " inode " << dn->inode << dendl;
3018
3019 // unlink from inode
3020 if (in) {
3021 if (in->is_dir()) {
3022 if (in->dir)
3023 dn->put(); // dir -> dn pin
3024 if (in->ll_ref)
3025 dn->put(); // ll_ref -> dn pin
3026 }
3027 dn->inode = 0;
3028 assert(in->dn_set.count(dn));
3029 in->dn_set.erase(dn);
3030 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3031 }
3032
3033 if (keepdentry) {
3034 dn->lease_mds = -1;
3035 } else {
3036 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3037
3038 // unlink from dir
3039 dn->dir->dentries.erase(dn->name);
3040 if (dn->dir->is_empty() && !keepdir)
3041 close_dir(dn->dir);
3042 dn->dir = 0;
3043
3044 // delete den
3045 lru.lru_remove(dn);
3046 dn->put();
3047 }
3048 }
3049
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;   // holds a ref so the inode outlives the flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  // Runs with client_lock held; stashes a flush error on the inode so a
  // later fsync/close can report it to the application.
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
			    << " 0x" << std::hex << inode->ino << std::dec
			    << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3071
3072
3073 /****
3074 * caps
3075 */
3076
3077 void Client::get_cap_ref(Inode *in, int cap)
3078 {
3079 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3080 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3081 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3082 in->get();
3083 }
3084 if ((cap & CEPH_CAP_FILE_CACHE) &&
3085 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3086 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3087 in->get();
3088 }
3089 in->get_cap_ref(cap);
3090 }
3091
/**
 * Drop references on cap bits.
 *
 * When the last reference on a bit is released this may: finish a
 * pending cap_snap (last FILE_WR ref), clear snap dirty-data flags and
 * wake commit waiters (last FILE_BUFFER ref), release the inode pins
 * taken in get_cap_ref() for BUFFER/CACHE, and check_caps() to return
 * any bits the MDS has since revoked.
 */
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we held that are no longer issued must be returned to the MDS
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
	  !in->cap_snaps.empty() &&
	  in->cap_snaps.rbegin()->second.writing) {
	ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
	in->cap_snaps.rbegin()->second.writing = 0;
	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
	for (auto &p : in->cap_snaps)
	  p.second.dirty_data = 0;
	signal_cond_list(in->waitfor_commit);
	ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
	++put_nref;  // drop the inode pin taken by get_cap_ref
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // drop the inode pin taken by get_cap_ref
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3125
/*
 * Block until we hold at least the @need caps (plus whatever of @want is
 * issued and not being revoked), taking a cap reference on @need.
 *
 * @param in inode to acquire caps on
 * @param need caps we must have before returning
 * @param want additional caps to include in *phave if available
 * @param phave out: the caps acquired, i.e. need | (have & want)
 * @param endoff for writes, the end offset of the write; used to request a
 *        larger max_size from the MDS and to wait until writing that far is
 *        permitted
 * @return 0 on success; -EBADF if no open handle wants the needed caps any
 *         more; -EROFS for writes on a read-only session; or an error from
 *         the pool-permission check / cap renewal.
 */
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      // no open file handle wants these caps anymore
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // ask the MDS for a bigger max_size if this write approaches/exceeds it
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        // cannot write past max_size until the MDS grants more
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // writes must not race with an in-progress cap_snap
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick a flush so the snaps' dirty buffered data gets written out
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                 << " need " << ccap_string(need) << " want " << ccap_string(want)
                 << " revoking " << ccap_string(revoking)
                 << dendl;
        // only report wanted caps that are not currently being revoked
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. by session teardown); re-request them
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        // MDS already knows everything our handles want; clear the flag
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3220
3221 int Client::get_caps_used(Inode *in)
3222 {
3223 unsigned used = in->caps_used();
3224 if (!(used & CEPH_CAP_FILE_CACHE) &&
3225 !objectcacher->set_is_empty(&in->oset))
3226 used |= CEPH_CAP_FILE_CACHE;
3227 return used;
3228 }
3229
3230 void Client::cap_delay_requeue(Inode *in)
3231 {
3232 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3233 in->hold_caps_until = ceph_clock_now();
3234 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3235 delayed_caps.push_back(&in->cap_item);
3236 }
3237
/*
 * Send a cap update for one cap to its MDS, trimming cap->issued to
 * @retain (which releases anything not retained and acks revocations).
 *
 * @param in inode the cap belongs to
 * @param session session of the MDS that granted the cap
 * @param cap the cap to update; issued/implemented are adjusted here
 * @param sync if true, request a synchronous ack (CLIENT_CAPS_SYNC)
 * @param used caps currently in active use
 * @param want caps we want the MDS to keep issuing
 * @param retain caps we would like to keep holding
 * @param flush dirty cap bits being flushed in this message (0 if none)
 * @param flush_tid tid identifying the flush (0 if not flushing)
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never retain caps that are being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
           << " mds." << session->mds_num << " seq " << cap->seq
           << (sync ? " sync " : " async ")
           << " used " << ccap_string(used)
           << " want " << ccap_string(want)
           << " flush " << ccap_string(flush)
           << " retain " << ccap_string(retain)
           << " held "<< ccap_string(held)
           << " revoking " << ccap_string(revoking)
           << " dropping " << ccap_string(dropping)
           << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: shrink issued to what we retain, and implemented
    // to what is issued or still actively used.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  // attribute the flush to whoever dirtied the caps
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    // flushing xattr changes: include the full xattr map
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth MDS handles max_size requests
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3347
3348 static bool is_max_size_approaching(Inode *in)
3349 {
3350 /* mds will adjust max size according to the reported size */
3351 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 return false;
3353 if (in->size >= in->max_size)
3354 return true;
3355 /* half of previous max_size increment has been used */
3356 if (in->max_size > in->reported_size &&
3357 (in->size << 1) >= in->max_size + in->reported_size)
3358 return true;
3359 return false;
3360 }
3361
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *        CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // keep what is wanted/used plus PIN; while mounted, retain broader caps
  // so we don't thrash with the MDS
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
           << " wanted " << ccap_string(wanted)
           << " used " << ccap_string(used)
           << " issued " << ccap_string(issued)
           << " revoking " << ccap_string(revoking)
           << " flags=" << flags
           << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // CACHE is being revoked and no dirty buffers remain: try to drop the
  // cached data so we can release the cap
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;  // advance before acting on cap, in case its map entry changes

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // caps covered by the auth cap do not count as "used" on non-auth MDSes
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap->issued)
             << " implemented " << ccap_string(cap->implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      // dirty caps are flushed through the auth cap only
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3510
3511
/*
 * Capture the inode's current cap state into a new cap_snap keyed by the
 * old snap context's seq, so dirty state as of snapshot time can be
 * flushed to the MDS separately from later writes.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // a cap_snap is already pending behind an active writer; don't stack another
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // NOTE(review): `dirty` is in->caps_dirty() captured above, so the third
    // clause appears subsumed by the first; kept as-is for safety.
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    // remember whether buffered (uncommitted) data existed at snap time
    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata as of the old snap context
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // a writer is active; finalize when the last WR ref drops
      // (see put_cap_ref)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3551
/*
 * Finalize a cap_snap: record sizes/timestamps as of snapshot time and,
 * unless buffered data is still outstanding, flush it to the MDS now.
 * If buffered data remains, the flush is deferred to _flushed_cap_snap().
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // fold in any caps dirtied since the snap was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // buffered data still outstanding; flush happens when it commits
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
             << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3577
3578 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3579 {
3580 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3581 in->cap_snaps.at(seq).dirty_data = 0;
3582 flush_snaps(in);
3583 }
3584
3585 void Client::flush_snaps(Inode *in, bool all_again)
3586 {
3587 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3588 assert(in->cap_snaps.size());
3589
3590 // pick auth mds
3591 assert(in->auth_cap);
3592 MetaSession *session = in->auth_cap->session;
3593 int mseq = in->auth_cap->mseq;
3594
3595 for (auto &p : in->cap_snaps) {
3596 CapSnap &capsnap = p.second;
3597 if (!all_again) {
3598 // only flush once per session
3599 if (capsnap.flush_tid > 0)
3600 continue;
3601 }
3602
3603 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3604 << " follows " << p.first
3605 << " size " << capsnap.size
3606 << " mtime " << capsnap.mtime
3607 << " dirty_data=" << capsnap.dirty_data
3608 << " writing=" << capsnap.writing
3609 << " on " << *in << dendl;
3610 if (capsnap.dirty_data || capsnap.writing)
3611 continue;
3612
3613 if (capsnap.flush_tid == 0) {
3614 capsnap.flush_tid = ++last_flush_tid;
3615 if (!in->flushing_cap_item.is_on_list())
3616 session->flushing_caps.push_back(&in->flushing_cap_item);
3617 session->flushing_caps_tids.insert(capsnap.flush_tid);
3618 }
3619
3620 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3621 cap_epoch_barrier);
3622 if (user_id >= 0)
3623 m->caller_uid = user_id;
3624 if (group_id >= 0)
3625 m->caller_gid = group_id;
3626
3627 m->set_client_tid(capsnap.flush_tid);
3628 m->head.snap_follows = p.first;
3629
3630 m->head.caps = capsnap.issued;
3631 m->head.dirty = capsnap.dirty;
3632
3633 m->head.uid = capsnap.uid;
3634 m->head.gid = capsnap.gid;
3635 m->head.mode = capsnap.mode;
3636 m->btime = capsnap.btime;
3637
3638 m->size = capsnap.size;
3639
3640 m->head.xattr_version = capsnap.xattr_version;
3641 ::encode(capsnap.xattrs, m->xattrbl);
3642
3643 m->ctime = capsnap.ctime;
3644 m->btime = capsnap.btime;
3645 m->mtime = capsnap.mtime;
3646 m->atime = capsnap.atime;
3647 m->time_warp_seq = capsnap.time_warp_seq;
3648 m->change_attr = capsnap.change_attr;
3649
3650 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3651 m->inline_version = in->inline_version;
3652 m->inline_data = in->inline_data;
3653 }
3654
3655 assert(!session->flushing_caps_tids.empty());
3656 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3657
3658 session->con->send_message(m);
3659 }
3660 }
3661
3662
3663
3664 void Client::wait_on_list(list<Cond*>& ls)
3665 {
3666 Cond cond;
3667 ls.push_back(&cond);
3668 cond.Wait(client_lock);
3669 ls.remove(&cond);
3670 }
3671
3672 void Client::signal_cond_list(list<Cond*>& ls)
3673 {
3674 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3675 (*it)->Signal();
3676 }
3677
/*
 * Block on client_lock until a context queued on @ls is completed (see
 * signal_context_list()).  The queued C_Cond frees itself when completed;
 * its result code is written to r but ignored here.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;  // written by C_Cond on completion; value unused
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}
3687
3688 void Client::signal_context_list(list<Context*>& ls)
3689 {
3690 while (!ls.empty()) {
3691 ls.front()->complete(0);
3692 ls.pop_front();
3693 }
3694 }
3695
3696 void Client::wake_inode_waiters(MetaSession *s)
3697 {
3698 xlist<Cap*>::iterator iter = s->caps.begin();
3699 while (!iter.end()){
3700 signal_cond_list((*iter)->inode->waitfor_caps);
3701 ++iter;
3702 }
3703 }
3704
3705
3706 // flush dirty data (from objectcache)
3707
// Completion that runs the user's cache-invalidate callback from the
// finisher thread, outside client_lock.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // captured at construction; does not pin the inode
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    // record the faked ino when faked inos are in use, else the real vino
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3727
3728 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3729 {
3730 if (unmounting)
3731 return;
3732 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3733 ino_invalidate_cb(callback_handle, ino, off, len);
3734 }
3735
3736 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3737
3738 if (ino_invalidate_cb)
3739 // we queue the invalidate, which calls the callback and decrements the ref
3740 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3741 }
3742
3743 void Client::_invalidate_inode_cache(Inode *in)
3744 {
3745 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3746
3747 // invalidate our userspace inode cache
3748 if (cct->_conf->client_oc) {
3749 objectcacher->release_set(&in->oset);
3750 if (!objectcacher->set_is_empty(&in->oset))
3751 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3752 }
3753
3754 _schedule_invalidate_callback(in, 0, 0);
3755 }
3756
3757 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3758 {
3759 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3760
3761 // invalidate our userspace inode cache
3762 if (cct->_conf->client_oc) {
3763 vector<ObjectExtent> ls;
3764 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3765 objectcacher->discard_set(&in->oset, ls);
3766 }
3767
3768 _schedule_invalidate_callback(in, off, len);
3769 }
3770
3771 bool Client::_release(Inode *in)
3772 {
3773 ldout(cct, 20) << "_release " << *in << dendl;
3774 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3775 _invalidate_inode_cache(in);
3776 return true;
3777 }
3778 return false;
3779 }
3780
3781 bool Client::_flush(Inode *in, Context *onfinish)
3782 {
3783 ldout(cct, 10) << "_flush " << *in << dendl;
3784
3785 if (!in->oset.dirty_or_tx) {
3786 ldout(cct, 10) << " nothing to flush" << dendl;
3787 onfinish->complete(0);
3788 return true;
3789 }
3790
3791 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3792 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3793 objectcacher->purge_set(&in->oset);
3794 if (onfinish) {
3795 onfinish->complete(-ENOSPC);
3796 }
3797 return true;
3798 }
3799
3800 return objectcacher->flush_set(&in->oset, onfinish);
3801 }
3802
/*
 * Synchronously flush dirty buffered data in [offset, offset+size) to the
 * OSDs.  client_lock must be held on entry; it is dropped while waiting
 * for the flush to complete and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private mutex/cond pair so we can sleep without holding client_lock
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3827
3828 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3829 {
3830 // Mutex::Locker l(client_lock);
3831 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3832 Inode *in = static_cast<Inode *>(oset->parent);
3833 assert(in);
3834 _flushed(in);
3835 }
3836
// All dirty data for the inode has been flushed; drop the CACHE+BUFFER
// cap refs held across the flush (presumably taken on the write path —
// confirm against callers of get_cap_ref).
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
3843
3844
3845
3846 // checks common to add_update_cap, handle_cap_grant
3847 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3848 {
3849 unsigned had = in->caps_issued();
3850
3851 if ((issued & CEPH_CAP_FILE_CACHE) &&
3852 !(had & CEPH_CAP_FILE_CACHE))
3853 in->cache_gen++;
3854
3855 if ((issued & CEPH_CAP_FILE_SHARED) &&
3856 !(had & CEPH_CAP_FILE_SHARED)) {
3857 in->shared_gen++;
3858
3859 if (in->is_dir())
3860 clear_dir_complete_and_ordered(in, true);
3861 }
3862 }
3863
/*
 * Install a new cap from @mds_session on @in, or update the existing one.
 *
 * @param in inode receiving the cap
 * @param mds_session session of the granting MDS
 * @param cap_id MDS-assigned cap id
 * @param issued cap bits granted
 * @param seq cap sequence number
 * @param mseq cap migration sequence number
 * @param realm snap realm ino for the inode (used for the first cap)
 * @param flags CEPH_CAP_FLAG_* (AUTH marks the auth MDS' cap)
 * @param cap_perms credentials under which the cap was obtained
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // first cap overall: attach the inode to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
    cap_list.push_back(&in->cap_item);
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // switch auth cap only if this cap has a newer migration seq
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued |= issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
           << " from mds." << mds
           << " on " << *in
           << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits were granted: wake anyone blocked in get_caps()
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3949
/*
 * Remove a cap from its inode and session, freeing it.  If @queue_release
 * is set, a cap release is queued so the MDS learns we dropped it.
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    // losing the auth cap: flush bookkeeping tied to it goes too
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    // last cap gone: detach the inode from its snap realm
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
3988
3989 void Client::remove_all_caps(Inode *in)
3990 {
3991 while (!in->caps.empty())
3992 remove_cap(in->caps.begin()->second, true);
3993 }
3994
/*
 * Drop every cap held on session @s (session teardown).  Dirty/flushing
 * state that can no longer be flushed is discarded and waiters are woken
 * so they can re-evaluate (get_caps() sees I_CAP_DROPPED and re-requests).
 */
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      // true if either dirty or flushing caps exist (int bits -> bool)
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // hold a ref while clearing so the inode cannot vanish mid-clear
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->dirty_caps = 0;
      // drop the ref taken by mark_caps_dirty()
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
4030
4031 int Client::_do_remount(void)
4032 {
4033 errno = 0;
4034 int r = remount_cb(callback_handle);
4035 if (r != 0) {
4036 int e = errno;
4037 client_t whoami = get_nodeid();
4038 if (r == -1) {
4039 lderr(cct) <<
4040 "failed to remount (to trim kernel dentries): "
4041 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4042 } else {
4043 lderr(cct) <<
4044 "failed to remount (to trim kernel dentries): "
4045 "return code = " << r << dendl;
4046 }
4047 bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
4048 cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
4049 if (should_abort && !unmounting) {
4050 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4051 ceph_abort();
4052 }
4053 }
4054 return r;
4055 }
4056
// Finisher context that triggers a remount (see _invalidate_kernel_dcache)
// from outside client_lock.
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);  // queued with r == 0 only
    client->_do_remount();
  }
};
4067
4068 void Client::_invalidate_kernel_dcache()
4069 {
4070 if (unmounting)
4071 return;
4072 if (can_invalidate_dentries) {
4073 if (dentry_invalidate_cb && root->dir) {
4074 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4075 p != root->dir->dentries.end();
4076 ++p) {
4077 if (p->second->inode)
4078 _schedule_invalidate_dentry_callback(p->second, false);
4079 }
4080 }
4081 } else if (remount_cb) {
4082 // Hacky:
4083 // when remounting a file system, linux kernel trims all unused dentries in the fs
4084 remount_finisher.queue(new C_Client_Remount(this));
4085 }
4086 }
4087
/*
 * Trim the cap count on session @s down toward @max, as requested by the
 * MDS.  Unused non-auth caps are dropped directly; otherwise we try to
 * expire dentries so inodes (and their caps) can be released.  If we still
 * exceed @max afterwards, ask the kernel to drop its dentries too.
 */
void Client::trim_caps(MetaSession *s, int max)
{
  mds_rank_t mds = s->mds_num;
  int caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
    << " caps " << caps_size << dendl;

  int trimmed = 0;
  xlist<Cap*>::iterator p = s->caps.begin();
  std::set<InodeRef> anchor; /* prevent put_inode from deleting all caps during traversal */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        remove_cap(cap, true);
        /* N.B. no need to push onto anchor, as we are only removing one cap */
        trimmed++;
      }
    } else {
      // auth (or only) cap: try to expire the inode's dentries instead
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
        Dentry *dn = *q++;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " anchoring inode: " << in->ino << dendl;
          anchor.insert(in);
          trim_dentry(dn);
        } else {
          ldout(cct, 20) << " not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      // all dentries were expireable, so the inode's cap should go away
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " clearing anchored inodes" << dendl;
  anchor.clear();

  // still over the limit: ask the kernel to drop dentries too
  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4151
4152 void Client::force_session_readonly(MetaSession *s)
4153 {
4154 s->readonly = true;
4155 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4156 Inode *in = (*p)->inode;
4157 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4158 signal_cond_list(in->waitfor_caps);
4159 }
4160 }
4161
// Mark cap bits dirty on the inode.  An inode ref is taken on the first
// clean->dirty transition; it is dropped once the dirty state is cleared
// (see e.g. remove_session_caps()).
void Client::mark_caps_dirty(Inode *in, int caps)
{
  ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
                 << ccap_string(in->dirty_caps | caps) << dendl;
  if (caps && !in->caps_dirty())
    in->get();
  in->dirty_caps |= caps;
}
4170
/**
 * Move the inode's dirty caps into the "flushing" state.
 *
 * Allocates a fresh flush tid, records the flushed bits under that tid
 * on both the inode and its auth session, merges the bits into
 * flushing_caps and clears dirty_caps.
 *
 * @param in   inode whose dirty caps are being flushed (must have an auth cap)
 * @param ptid out: the flush tid assigned to this batch
 * @return the cap bits now being flushed (always non-zero; asserted)
 */
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // num_flushing_caps counts inodes with any flushing caps, so only
  // bump it on the first transition into the flushing state.
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->dirty_caps = 0;

  // track the inode on the session's flushing list (once) and the tid
  // so wait_sync_caps() can find the oldest outstanding flush per session
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4198
4199 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4200 {
4201 for (auto &p : in->cap_snaps) {
4202 CapSnap &capsnap = p.second;
4203 if (capsnap.flush_tid > 0) {
4204 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4205 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4206 }
4207 }
4208 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4209 it != in->flushing_cap_tids.end();
4210 ++it) {
4211 old_s->flushing_caps_tids.erase(it->first);
4212 new_s->flushing_caps_tids.insert(it->first);
4213 }
4214 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4215 }
4216
4217 /*
4218 * Flush all caps back to the MDS. Because the callers generally wait on the
4219 * result of this function (syncfs and umount cases), we set
4220 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4221 */
4222 void Client::flush_caps_sync()
4223 {
4224 ldout(cct, 10) << __func__ << dendl;
4225 xlist<Inode*>::iterator p = delayed_caps.begin();
4226 while (!p.end()) {
4227 unsigned flags = CHECK_CAPS_NODELAY;
4228 Inode *in = *p;
4229
4230 ++p;
4231 delayed_caps.pop_front();
4232 if (p.end() && cap_list.empty())
4233 flags |= CHECK_CAPS_SYNCHRONOUS;
4234 check_caps(in, flags);
4235 }
4236
4237 // other caps, too
4238 p = cap_list.begin();
4239 while (!p.end()) {
4240 unsigned flags = CHECK_CAPS_NODELAY;
4241 Inode *in = *p;
4242
4243 ++p;
4244 if (p.end())
4245 flags |= CHECK_CAPS_SYNCHRONOUS;
4246 check_caps(in, flags);
4247 }
4248 }
4249
/**
 * (Re)send a cap flush message to the auth MDS for every outstanding
 * flush tid on this inode (e.g. after session reconnect or cap import).
 *
 * @param in      inode with pending flushes; its auth cap must belong
 *                to @a session (asserted)
 * @param session the auth MDS session to send to
 * @param sync    if true, request a journal flush on the final tid so
 *                the MDS acks promptly
 */
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4271
/**
 * Block until every cap flush on @a in with tid <= @a want has been
 * acked by the MDS.
 *
 * Must be called with client_lock held; wait_on_list() releases and
 * reacquires it while sleeping.  handle_cap_flush_ack() signals
 * in->waitfor_caps as tids are retired.
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    // tids are flushed in order, so only the oldest matters
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4285
4286 void Client::wait_sync_caps(ceph_tid_t want)
4287 {
4288 retry:
4289 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4290 << num_flushing_caps << " total flushing)" << dendl;
4291 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4292 p != mds_sessions.end();
4293 ++p) {
4294 MetaSession *s = p->second;
4295 if (s->flushing_caps_tids.empty())
4296 continue;
4297 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4298 if (oldest_tid <= want) {
4299 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4300 << " (want " << want << ")" << dendl;
4301 sync_cond.Wait(client_lock);
4302 goto retry;
4303 }
4304 }
4305 }
4306
/**
 * After an MDS session is re-established, re-send all pending cap
 * flushes (and snap flushes) for inodes on that session, skipping
 * anything already re-sent by early_kick_flushing_caps() during the
 * reconnect phase.
 */
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;  // already re-flushed in the reconnect stage
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  // the early-kick set has served its purpose for this reconnect
  session->early_flushing_caps.clear();
}
4325
/**
 * During the client-reconnect stage, re-send cap flushes whose flushed
 * bits were (partially) revoked, so the MDS processes them before it
 * can issue those caps to another client.  Inodes handled here are
 * remembered in early_flushing_caps so kick_flushing_caps() skips them.
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;  // nothing revoked; the normal kick will handle it

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4352
4353 void Client::kick_maxsize_requests(MetaSession *session)
4354 {
4355 xlist<Cap*>::iterator iter = session->caps.begin();
4356 while (!iter.end()){
4357 (*iter)->inode->requested_max_size = 0;
4358 (*iter)->inode->wanted_max_size = 0;
4359 signal_cond_list((*iter)->inode->waitfor_caps);
4360 ++iter;
4361 }
4362 }
4363
4364 void SnapRealm::build_snap_context()
4365 {
4366 set<snapid_t> snaps;
4367 snapid_t max_seq = seq;
4368
4369 // start with prior_parents?
4370 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4371 snaps.insert(prior_parent_snaps[i]);
4372
4373 // current parent's snaps
4374 if (pparent) {
4375 const SnapContext& psnapc = pparent->get_snap_context();
4376 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4377 if (psnapc.snaps[i] >= parent_since)
4378 snaps.insert(psnapc.snaps[i]);
4379 if (psnapc.seq > max_seq)
4380 max_seq = psnapc.seq;
4381 }
4382
4383 // my snaps
4384 for (unsigned i=0; i<my_snaps.size(); i++)
4385 snaps.insert(my_snaps[i]);
4386
4387 // ok!
4388 cached_snap_context.seq = max_seq;
4389 cached_snap_context.snaps.resize(0);
4390 cached_snap_context.snaps.reserve(snaps.size());
4391 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4392 cached_snap_context.snaps.push_back(*p);
4393 }
4394
4395 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4396 {
4397 list<SnapRealm*> q;
4398 q.push_back(realm);
4399
4400 while (!q.empty()) {
4401 realm = q.front();
4402 q.pop_front();
4403
4404 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4405 realm->invalidate_cache();
4406
4407 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4408 p != realm->pchildren.end();
4409 ++p)
4410 q.push_back(*p);
4411 }
4412 }
4413
4414 SnapRealm *Client::get_snap_realm(inodeno_t r)
4415 {
4416 SnapRealm *realm = snap_realms[r];
4417 if (!realm)
4418 snap_realms[r] = realm = new SnapRealm(r);
4419 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4420 realm->nref++;
4421 return realm;
4422 }
4423
4424 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4425 {
4426 if (snap_realms.count(r) == 0) {
4427 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4428 return NULL;
4429 }
4430 SnapRealm *realm = snap_realms[r];
4431 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4432 realm->nref++;
4433 return realm;
4434 }
4435
4436 void Client::put_snap_realm(SnapRealm *realm)
4437 {
4438 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4439 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4440 if (--realm->nref == 0) {
4441 snap_realms.erase(realm->ino);
4442 if (realm->pparent) {
4443 realm->pparent->pchildren.erase(realm);
4444 put_snap_realm(realm->pparent);
4445 }
4446 delete realm;
4447 }
4448 }
4449
4450 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4451 {
4452 if (realm->parent != parent) {
4453 ldout(cct, 10) << "adjust_realm_parent " << *realm
4454 << " " << realm->parent << " -> " << parent << dendl;
4455 realm->parent = parent;
4456 if (realm->pparent) {
4457 realm->pparent->pchildren.erase(realm);
4458 put_snap_realm(realm->pparent);
4459 }
4460 realm->pparent = get_snap_realm(parent);
4461 realm->pparent->pchildren.insert(realm);
4462 return true;
4463 }
4464 return false;
4465 }
4466
4467 static bool has_new_snaps(const SnapContext& old_snapc,
4468 const SnapContext& new_snapc)
4469 {
4470 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4471 }
4472
4473
4474 void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4475 {
4476 SnapRealm *first_realm = NULL;
4477 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4478
4479 map<SnapRealm*, SnapContext> dirty_realms;
4480
4481 bufferlist::iterator p = bl.begin();
4482 while (!p.end()) {
4483 SnapRealmInfo info;
4484 ::decode(info, p);
4485 SnapRealm *realm = get_snap_realm(info.ino());
4486
4487 bool invalidate = false;
4488
4489 if (info.seq() > realm->seq) {
4490 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4491 << dendl;
4492
4493 if (flush) {
4494 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4495 // flush me + children
4496 list<SnapRealm*> q;
4497 q.push_back(realm);
4498 while (!q.empty()) {
4499 SnapRealm *realm = q.front();
4500 q.pop_front();
4501
4502 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4503 p != realm->pchildren.end();
4504 ++p)
4505 q.push_back(*p);
4506
4507 if (dirty_realms.count(realm) == 0) {
4508 realm->nref++;
4509 dirty_realms[realm] = realm->get_snap_context();
4510 }
4511 }
4512 }
4513
4514 // update
4515 realm->seq = info.seq();
4516 realm->created = info.created();
4517 realm->parent_since = info.parent_since();
4518 realm->prior_parent_snaps = info.prior_parent_snaps;
4519 realm->my_snaps = info.my_snaps;
4520 invalidate = true;
4521 }
4522
4523 // _always_ verify parent
4524 if (adjust_realm_parent(realm, info.parent()))
4525 invalidate = true;
4526
4527 if (invalidate) {
4528 invalidate_snaprealm_and_children(realm);
4529 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4530 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4531 } else {
4532 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4533 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4534 }
4535
4536 if (!first_realm)
4537 first_realm = realm;
4538 else
4539 put_snap_realm(realm);
4540 }
4541
4542 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4543 q != dirty_realms.end();
4544 ++q) {
4545 SnapRealm *realm = q->first;
4546 // if there are new snaps ?
4547 if (has_new_snaps(q->second, realm->get_snap_context())) {
4548 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4549 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4550 while (!r.end()) {
4551 Inode *in = *r;
4552 ++r;
4553 queue_cap_snap(in, q->second);
4554 }
4555 } else {
4556 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4557 }
4558 put_snap_realm(realm);
4559 }
4560
4561 if (realm_ret)
4562 *realm_ret = first_realm;
4563 else
4564 put_snap_realm(first_realm);
4565 }
4566
/**
 * Handle an MClientSnap message from an MDS.
 *
 * For CEPH_SNAP_OP_SPLIT, inodes listed in the message are moved out of
 * their old realm into the newly split-off realm (unless their realm is
 * newer than the split), and listed child realms are re-parented.  The
 * embedded snap trace is then applied; moved inodes whose snap context
 * gained snaps get cap snaps queued.  Consumes the message.
 */
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes to re-home into the split realm, with their old snap contexts
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move inodes whose realm postdates the split we're applying
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	// detach from the old realm; remember the old snap context so we
	// can decide below whether new snaps appeared
	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the snap trace; don't flush caps for a destroyed snap
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4644
4645 void Client::handle_quota(MClientQuota *m)
4646 {
4647 mds_rank_t mds = mds_rank_t(m->get_source().num());
4648 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4649 if (!session) {
4650 m->put();
4651 return;
4652 }
4653
4654 got_mds_push(session);
4655
4656 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4657
4658 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4659 if (inode_map.count(vino)) {
4660 Inode *in = NULL;
4661 in = inode_map[vino];
4662
4663 if (in) {
4664 in->quota = m->quota;
4665 in->rstat = m->rstat;
4666 }
4667 }
4668
4669 m->put();
4670 }
4671
/**
 * Top-level dispatcher for MClientCaps messages from an MDS.
 *
 * Applies any OSD epoch barrier carried by the message, locates the
 * target inode, and routes to the per-op handler.  EXPORT and
 * FLUSHSNAP_ACK are handled (and the message consumed) before the cap
 * lookup; IMPORT is applied first and then falls through to the grant
 * path.  Each handler (or the default branch) consumes the message.
 */
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // ops that don't require (or that install) a local cap for this mds
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);  // falls through to grant below
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  // ops that operate on an existing cap
  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4745
/**
 * Handle CEPH_CAP_OP_IMPORT: the MDS is giving us a (migrated) cap.
 *
 * Installs/updates the cap as the auth cap, removes the old peer cap it
 * replaces (if any), applies the embedded snap trace, and re-sends any
 * pending snap/cap flushes now that this MDS is auth.  Does NOT consume
 * the message; handle_caps() falls through to handle_cap_grant().
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // remember the peer cap this import replaces (and its perms), so we
  // can remove it after the new cap is installed
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // only drop the old cap if it is the one the import names
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4786
/**
 * Handle CEPH_CAP_OP_EXPORT: the MDS is migrating our cap away.
 *
 * If a peer cap is named, merge this cap's issued bits into the peer
 * cap (or create one) on the target MDS, migrating auth status and any
 * in-flight flushes.  If no peer is named, the cap is simply dropped
 * (I_CAP_DROPPED is flagged if it was the auth cap).  Consumes the
 * message.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the export names the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// merge into the existing target cap only if it matches the
	// peer id and is older than the migration sequence
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->mseq = m->peer.mseq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// no cap on the target MDS yet; create one carrying our issued bits
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no peer: the cap is simply going away
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4834
/**
 * Handle CEPH_CAP_OP_TRUNC: the MDS truncated the file; fold the new
 * size/truncate state and timestamps into the inode, honoring which
 * fields our issued caps let us keep locally.  Consumes the message.
 */
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  // include implemented and dirty bits so locally-held state wins
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
                         m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
                         m->get_ctime(), m->get_mtime(), m->get_atime(),
                         m->inline_version, m->inline_data, issued);
  m->put();
}
4853
/**
 * Handle CEPH_CAP_OP_FLUSH_ACK: the MDS acknowledged a cap flush.
 *
 * Retires every flush tid <= the acked tid (the MDS acks in order),
 * computes which cap bits are fully clean (bits still pending under a
 * later tid are masked back out), clears them from flushing_caps, and
 * wakes waiters.  Drops the inode ref taken by mark_caps_dirty() once
 * nothing is flushing or dirty.  Consumes the message.
 */
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;   // bits covered by the acked flush
    if (it->first <= flush_ack_tid) {
      // earlier (and the acked) tids are done; retire them
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a later tid still flushes these bits, so they are not clean yet
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session's oldest pending tid advanced
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref mark_caps_dirty() took, unless new dirt appeared
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4913
4914
/**
 * Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS acknowledged a snap flush.
 *
 * Erases the matching CapSnap (identified by snap_follows and the flush
 * tid) and its session tid bookkeeping; mismatched tids and unknown
 * follows values (possible duplicate acks) are logged and ignored.
 * Consumes the message.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
	      << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      // unlink from the session flushing list once nothing is pending
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
	    << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
4944
/**
 * Completion that invokes the registered dentry-invalidate callback on
 * the async invalidator thread (outside client_lock).
 *
 * Captures the parent-directory ino, the target ino (only when @a del
 * is set), and the dentry name at construction time, using faked inos
 * when the client exposes them.
 */
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // target inode; zero ino when not deleting
  string name;        // dentry name
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // invalidate-only: no target inode
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
4972
/**
 * Invoke the registered dentry-invalidate callback for (dirino, name),
 * ino.  Runs on the async invalidator thread; skipped while unmounting.
 */
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}
4981
4982 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4983 {
4984 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4985 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4986 }
4987
/**
 * Try to release references pinning @a in so it can be trimmed from
 * cache: drop expireable child dentries (recursing into snapshot
 * subtrees), close an empty dir, trim an open snapdir, and — when
 * @a sched_inval is set — schedule kernel dcache invalidation for any
 * remaining dentries held by the upper layer.
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() can erase the entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // close_dir dropped the dir's ref on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5030
/**
 * Handle GRANT/REVOKE/IMPORT cap updates from an MDS.
 *
 * Updates inode metadata from the message (only for fields whose
 * exclusive caps we do not hold), adjusts max_size, then applies the
 * cap change: on revocation, flush buffers / release cache / ack as
 * appropriate; on grant, check whether a non-auth MDS is revoking the
 * newly granted bits.  Wakes cap waiters and, if the link count hit
 * zero, tries to trim the inode.  Consumes the message.
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();

  in->layout = m->get_layout();

  // update inode
  int implemented = 0;
  int issued = in->caps_issued(&implemented) | in->caps_dirty();
  issued |= implemented;

  // only accept fields the MDS is authoritative for (we don't hold EXCL)
  if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }
  update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
			 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
			 m->get_mtime(), m->get_atime(),
			 m->inline_version, m->inline_data, issued);

  // max_size
  if (cap == in->auth_cap &&
      m->get_max_size() != in->max_size) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;  // tell the MDS what we actually want

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
	&& !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }

  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5151
/**
 * Fetch the supplementary group list for a user.
 *
 * Tries the registered getgroups callback first; otherwise (when
 * HAVE_GETGROUPLIST) resolves the user via getpwuid() and calls
 * getgrouplist(), growing the buffer until it fits.  On success the
 * caller owns the malloc'd *sgids buffer and must free() it.
 *
 * @return number of groups (>= 0), or -errno / -ENOMEM on failure;
 *         0 when no mechanism is available
 */
int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
{
  // cppcheck-suppress variableScope
  int sgid_count;
  gid_t *sgid_buf;

  if (getgroups_cb) {
    sgid_count = getgroups_cb(callback_handle, &sgid_buf);
    if (sgid_count > 0) {
      *sgids = sgid_buf;
      return sgid_count;
    }
    // non-positive count: fall through to the getgrouplist() path
  }

#if HAVE_GETGROUPLIST
  struct passwd *pw;
  pw = getpwuid(uid);
  if (pw == NULL) {
    ldout(cct, 3) << "getting user entry failed" << dendl;
    return -errno;
  }
  //use PAM to get the group list
  // initial number of group entries, defaults to posix standard of 16
  // PAM implementations may provide more than 16 groups....
  sgid_count = 16;
  sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
  if (sgid_buf == NULL) {
    ldout(cct, 3) << "allocating group memory failed" << dendl;
    return -ENOMEM;
  }

  while (1) {
#if defined(__APPLE__)
    if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
#else
    if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
#endif
      // we need to resize the group list and try again
      // (getgrouplist updated sgid_count to the required size)
      void *_realloc = NULL;
      if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
	ldout(cct, 3) << "allocating group memory failed" << dendl;
	free(sgid_buf);
	return -ENOMEM;
      }
      sgid_buf = (gid_t*)_realloc;
      continue;
    }
    // list was successfully retrieved
    break;
  }
  *sgids = sgid_buf;
  return sgid_count;
#else
  return 0;
#endif
}
5208
5209 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5210 {
5211 if (perms.uid() == 0)
5212 return 0;
5213
5214 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5215 int ret = _posix_acl_permission(in, perms, want);
5216 if (ret != -EAGAIN)
5217 return ret;
5218 }
5219
5220 // check permissions before doing anything else
5221 if (!in->check_mode(perms, want))
5222 return -EACCES;
5223 return 0;
5224 }
5225
5226 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5227 const UserPerm& perms)
5228 {
5229 int r = _getattr_for_perm(in, perms);
5230 if (r < 0)
5231 goto out;
5232
5233 r = 0;
5234 if (strncmp(name, "system.", 7) == 0) {
5235 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5236 r = -EPERM;
5237 } else {
5238 r = inode_permission(in, perms, want);
5239 }
5240 out:
5241 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5242 return r;
5243 }
5244
5245 ostream& operator<<(ostream &out, const UserPerm& perm) {
5246 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5247 return out;
5248 }
5249
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  // Validate that `perms` may apply the setattr described by stx/mask,
  // following POSIX chown/chmod/utimes ownership rules.  `r` stays
  // -EPERM through the ownership checks; each failing check jumps to
  // `out` with that value.
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission on the file
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change the owner; a non-root owner may only "chown"
    // to the current uid (a no-op)
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may change group only to a group they belong to or to the
    // file's current group; root may do anything
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
		  (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // non-root callers not in the file's (resulting) group have setgid
    // stripped, mirroring kernel chmod behaviour
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // non-owners may set times to "now" given write permission, but may
      // never set explicit timestamps
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5306
5307 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5308 {
5309 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5310 unsigned want = 0;
5311
5312 if ((flags & O_ACCMODE) == O_WRONLY)
5313 want = MAY_WRITE;
5314 else if ((flags & O_ACCMODE) == O_RDWR)
5315 want = MAY_READ | MAY_WRITE;
5316 else if ((flags & O_ACCMODE) == O_RDONLY)
5317 want = MAY_READ;
5318 if (flags & O_TRUNC)
5319 want |= MAY_WRITE;
5320
5321 int r = 0;
5322 switch (in->mode & S_IFMT) {
5323 case S_IFLNK:
5324 r = -ELOOP;
5325 goto out;
5326 case S_IFDIR:
5327 if (want & MAY_WRITE) {
5328 r = -EISDIR;
5329 goto out;
5330 }
5331 break;
5332 }
5333
5334 r = _getattr_for_perm(in, perms);
5335 if (r < 0)
5336 goto out;
5337
5338 r = inode_permission(in, perms, want);
5339 out:
5340 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5341 return r;
5342 }
5343
5344 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5345 {
5346 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5347 int r = _getattr_for_perm(dir, perms);
5348 if (r < 0)
5349 goto out;
5350
5351 r = inode_permission(dir, perms, MAY_EXEC);
5352 out:
5353 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5354 return r;
5355 }
5356
5357 int Client::may_create(Inode *dir, const UserPerm& perms)
5358 {
5359 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5360 int r = _getattr_for_perm(dir, perms);
5361 if (r < 0)
5362 goto out;
5363
5364 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5365 out:
5366 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5367 return r;
5368 }
5369
5370 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5371 {
5372 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5373 int r = _getattr_for_perm(dir, perms);
5374 if (r < 0)
5375 goto out;
5376
5377 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5378 if (r < 0)
5379 goto out;
5380
5381 /* 'name == NULL' means rmsnap */
5382 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5383 InodeRef otherin;
5384 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5385 if (r < 0)
5386 goto out;
5387 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5388 r = -EPERM;
5389 }
5390 out:
5391 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5392 return r;
5393 }
5394
5395 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5396 {
5397 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5398 int r = _getattr_for_perm(in, perms);
5399 if (r < 0)
5400 goto out;
5401
5402 if (perms.uid() == 0 || perms.uid() == in->uid) {
5403 r = 0;
5404 goto out;
5405 }
5406
5407 r = -EPERM;
5408 if (!S_ISREG(in->mode))
5409 goto out;
5410
5411 if (in->mode & S_ISUID)
5412 goto out;
5413
5414 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5415 goto out;
5416
5417 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5418 out:
5419 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5420 return r;
5421 }
5422
5423 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5424 {
5425 int mask = CEPH_STAT_CAP_MODE;
5426 bool force = false;
5427 if (acl_type != NO_ACL) {
5428 mask |= CEPH_STAT_CAP_XATTR;
5429 force = in->xattr_version == 0;
5430 }
5431 return _getattr(in, mask, perms, force);
5432 }
5433
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  // (ino, snapid) pair uniquely identifying this version of the inode
  return vinodeno_t(in->ino, in->snapid);
}
5439
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  // bare inode number, without the snapshot id
  return in->ino;
}
5445
5446
5447 /**
5448 * Resolve an MDS spec to a list of MDS daemon GIDs.
5449 *
5450 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5451 * It may be '*' in which case it matches all GIDs.
5452 *
5453 * If no error is returned, the `targets` vector will be populated with at least
5454 * one MDS.
5455 */
5456 int Client::resolve_mds(
5457 const std::string &mds_spec,
5458 std::vector<mds_gid_t> *targets)
5459 {
5460 assert(fsmap);
5461 assert(targets != nullptr);
5462
5463 mds_role_t role;
5464 std::stringstream ss;
5465 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5466 if (role_r == 0) {
5467 // We got a role, resolve it to a GID
5468 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5469 << role << "'" << dendl;
5470 targets->push_back(
5471 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5472 return 0;
5473 }
5474
5475 std::string strtol_err;
5476 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5477 if (strtol_err.empty()) {
5478 // It is a possible GID
5479 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5480 if (fsmap->gid_exists(mds_gid)) {
5481 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5482 targets->push_back(mds_gid);
5483 } else {
5484 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5485 << dendl;
5486 return -ENOENT;
5487 }
5488 } else if (mds_spec == "*") {
5489 // It is a wildcard: use all MDSs
5490 const auto mds_info = fsmap->get_mds_info();
5491
5492 if (mds_info.empty()) {
5493 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5494 return -ENOENT;
5495 }
5496
5497 for (const auto i : mds_info) {
5498 targets->push_back(i.first);
5499 }
5500 } else {
5501 // It did not parse as an integer, it is not a wildcard, it must be a name
5502 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5503 if (mds_gid == 0) {
5504 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5505
5506 lderr(cct) << "FSMap: " << *fsmap << dendl;
5507
5508 return -ENOENT;
5509 } else {
5510 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5511 << "' to GID " << mds_gid << dendl;
5512 targets->push_back(mds_gid);
5513 }
5514 }
5515
5516 return 0;
5517 }
5518
5519
5520 /**
5521 * Authenticate with mon and establish global ID
5522 */
int Client::authenticate()
{
  // Authenticate with the monitors and adopt the mon-assigned global id
  // as our client entity name.  Caller must hold client_lock.
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks on the mon round-trip; drop
  // client_lock around it so other client threads are not stalled.
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5543
int Client::fetch_fsmap(bool user)
{
  // Fetch (or refresh) our copy of the FSMap (`user` selects the
  // trimmed-down "fsmap.user" variant) and wait until we hold an epoch
  // at least as new as the monitor's latest.
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop client_lock while blocking on the mon round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) if our fsmap.user is missing or stale, and
    // block until the subscription delivers it
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full fsmap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5587
5588 /**
5589 *
5590 * @mds_spec one of ID, rank, GID, "*"
5591 *
5592 */
5593 int Client::mds_command(
5594 const std::string &mds_spec,
5595 const vector<string>& cmd,
5596 const bufferlist& inbl,
5597 bufferlist *outbl,
5598 string *outs,
5599 Context *onfinish)
5600 {
5601 Mutex::Locker lock(client_lock);
5602
5603 if (!initialized)
5604 return -ENOTCONN;
5605
5606 int r;
5607 r = authenticate();
5608 if (r < 0) {
5609 return r;
5610 }
5611
5612 r = fetch_fsmap(false);
5613 if (r < 0) {
5614 return r;
5615 }
5616
5617 // Look up MDS target(s) of the command
5618 std::vector<mds_gid_t> targets;
5619 r = resolve_mds(mds_spec, &targets);
5620 if (r < 0) {
5621 return r;
5622 }
5623
5624 // If daemons are laggy, we won't send them commands. If all
5625 // are laggy then we fail.
5626 std::vector<mds_gid_t> non_laggy;
5627 for (const auto gid : targets) {
5628 const auto info = fsmap->get_info_gid(gid);
5629 if (!info.laggy()) {
5630 non_laggy.push_back(gid);
5631 }
5632 }
5633 if (non_laggy.size() == 0) {
5634 *outs = "All targeted MDS daemons are laggy";
5635 return -ENOENT;
5636 }
5637
5638 if (metadata.empty()) {
5639 // We are called on an unmounted client, so metadata
5640 // won't be initialized yet.
5641 populate_metadata("");
5642 }
5643
5644 // Send commands to targets
5645 C_GatherBuilder gather(cct, onfinish);
5646 for (const auto target_gid : non_laggy) {
5647 const auto info = fsmap->get_info_gid(target_gid);
5648
5649 // Open a connection to the target MDS
5650 entity_inst_t inst = info.get_inst();
5651 ConnectionRef conn = messenger->get_connection(inst);
5652
5653 // Generate MDSCommandOp state
5654 auto &op = command_table.start_command();
5655
5656 op.on_finish = gather.new_sub();
5657 op.cmd = cmd;
5658 op.outbl = outbl;
5659 op.outs = outs;
5660 op.inbl = inbl;
5661 op.mds_gid = target_gid;
5662 op.con = conn;
5663
5664 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5665 << " tid=" << op.tid << cmd << dendl;
5666
5667 // Construct and send MCommand
5668 MCommand *m = op.get_message(monclient->get_fsid());
5669 conn->send_message(m);
5670 }
5671 gather.activate();
5672
5673 return 0;
5674 }
5675
5676 void Client::handle_command_reply(MCommandReply *m)
5677 {
5678 ceph_tid_t const tid = m->get_tid();
5679
5680 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5681
5682 if (!command_table.exists(tid)) {
5683 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5684 m->put();
5685 return;
5686 }
5687
5688 auto &op = command_table.get_command(tid);
5689 if (op.outbl) {
5690 op.outbl->claim(m->get_data());
5691 }
5692 if (op.outs) {
5693 *op.outs = m->rs;
5694 }
5695
5696 if (op.on_finish) {
5697 op.on_finish->complete(m->r);
5698 }
5699
5700 command_table.erase(tid);
5701
5702 m->put();
5703 }
5704
5705 // -------------------
5706 // MOUNT
5707
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  // Mount the filesystem at mount_root (default "/"): authenticate,
  // subscribe to the mdsmap, optionally wait for an available MDS
  // cluster, then walk up from the mount point issuing GETATTRs so we
  // hold the root and its ancestors.
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: resolve its cluster id and
    // subscribe to that filesystem's mdsmap ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster is usable (or provably stuck)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the real root, so
  // quota/realm information above the mount point is available.  EACCES
  // on an ancestor is tolerated once the mount point itself resolved.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5820
5821 // UNMOUNT
5822
5823 void Client::_close_sessions()
5824 {
5825 while (!mds_sessions.empty()) {
5826 // send session closes!
5827 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5828 p != mds_sessions.end();
5829 ++p) {
5830 if (p->second->state != MetaSession::STATE_CLOSING) {
5831 _close_mds_session(p->second);
5832 }
5833 }
5834
5835 // wait for sessions to close
5836 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5837 mount_cond.Wait(client_lock);
5838 }
5839 }
5840
5841 void Client::flush_mdlog_sync()
5842 {
5843 if (mds_requests.empty())
5844 return;
5845 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5846 p != mds_sessions.end();
5847 ++p) {
5848 MetaSession *s = p->second;
5849 flush_mdlog(s);
5850 }
5851 }
5852
5853 void Client::flush_mdlog(MetaSession *session)
5854 {
5855 // Only send this to Luminous or newer MDS daemons, older daemons
5856 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5857 const uint64_t features = session->con->get_features();
5858 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5859 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5860 session->con->send_message(m);
5861 }
5862 }
5863
5864
void Client::_unmount()
{
  // Tear down the mount: drain requests, close files/dirs, flush dirty
  // data and caps, empty the cache, then close MDS sessions.  Caller
  // must hold client_lock.
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  deleg_timeout = 0;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait until all in-flight MDS requests have completed
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // same for handles opened through the low-level (ll_) interface
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // and any directories still open
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    // a blacklisted client cannot write back anything; drop caches and
    // bail out without the clean flush/close protocol below
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for unsafe (not-yet-committed) sync writes to land
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate with a saved `next` because _release/_flush callbacks may
    // erase entries from inode_map)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	// hold a temporary ref so the inode survives the flush
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait for the cache to fully drain (caps must be returned before the
  // MDS will let the inodes go); dump the cache if we stall for 5s
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
5987
void Client::unmount()
{
  // Public entry point: take the client lock and run the real teardown.
  Mutex::Locker lock(client_lock);
  _unmount();
}
5993
5994 void Client::flush_cap_releases()
5995 {
5996 // send any cap releases
5997 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5998 p != mds_sessions.end();
5999 ++p) {
6000 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6001 p->first)) {
6002 if (cct->_conf->client_inject_release_failure) {
6003 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6004 p->second->release->put();
6005 } else {
6006 p->second->con->send_message(p->second->release);
6007 }
6008 p->second->release = 0;
6009 }
6010 }
6011 }
6012
void Client::tick()
{
  // Periodic housekeeping.  Re-arms itself via the timer, times out
  // mount-phase requests, renews caps, flushes cap releases, processes
  // delayed caps and trims the cache.
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: stall this tick once, then reset the knob
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // while mounting, abort the oldest request once it exceeds the
    // mount timeout, and wake anyone blocked on the mdsmap / session open
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_caps.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;  // advance before pop_front() removes the current entry
    if (in->hold_caps_until > now)
      break;  // stop at the first entry that is not yet due
    delayed_caps.pop_front();
    cap_list.push_back(&in->cap_item);
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6070
6071 void Client::renew_caps()
6072 {
6073 ldout(cct, 10) << "renew_caps()" << dendl;
6074 last_cap_renew = ceph_clock_now();
6075
6076 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6077 p != mds_sessions.end();
6078 ++p) {
6079 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6080 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6081 renew_caps(p->second);
6082 }
6083 }
6084
6085 void Client::renew_caps(MetaSession *session)
6086 {
6087 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6088 session->last_cap_renew_request = ceph_clock_now();
6089 uint64_t seq = ++session->cap_renew_seq;
6090 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6091 }
6092
6093
6094 // ===============================================================
6095 // high level (POSIXy) interface
6096
6097 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6098 InodeRef *target, const UserPerm& perms)
6099 {
6100 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6101 MetaRequest *req = new MetaRequest(op);
6102 filepath path;
6103 dir->make_nosnap_relative_path(path);
6104 path.push_dentry(name);
6105 req->set_filepath(path);
6106 req->set_inode(dir);
6107 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6108 mask |= DEBUG_GETATTR_CAPS;
6109 req->head.args.getattr.mask = mask;
6110
6111 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6112
6113 int r = make_request(req, perms, target);
6114 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6115 return r;
6116 }
6117
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  // Resolve `dname` under `dir`, preferring the local dcache when our
  // dentry lease or the directory's shared cap is still valid; otherwise
  // fall through to an MDS lookup via _do_lookup().
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      // no parent linkage known; ".." of an unlinked root-ish dir is itself
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    // virtual ".snap" directory
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
	     << " seq " << dn->lease_seq
	     << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only trustworthy while the issuing session's caps
	// are live and its generation matches the lease's
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// a null dentry in a complete directory proves the name is absent
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or stale: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6219
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // Return (in *pdn) the dentry for `name` under `dir`, creating a null
  // dentry if none exists.  With expect_null, fail with -EEXIST when a
  // lease-valid dentry with an inode is already present.
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      // only trust the lease while the issuing session's caps are live
      // and the generation matches
      if (s->cap_ttl > now &&
	  s->cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
6251
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  // Walk `origpath` component by component (from root or cwd), doing
  // permission checks when client_permissions is set, and resolving
  // symlinks: intermediate symlinks are always followed, a trailing
  // symlink only when `followsym`.  On success *end holds the final
  // inode.
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << "  (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each directory we traverse
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6333
6334
6335 // namespace ops
6336
6337 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6338 {
6339 Mutex::Locker lock(client_lock);
6340 tout(cct) << "link" << std::endl;
6341 tout(cct) << relexisting << std::endl;
6342 tout(cct) << relpath << std::endl;
6343
6344 if (unmounting)
6345 return -ENOTCONN;
6346
6347 filepath existing(relexisting);
6348
6349 InodeRef in, dir;
6350 int r = path_walk(existing, &in, perm, true);
6351 if (r < 0)
6352 return r;
6353 if (std::string(relpath) == "/") {
6354 r = -EEXIST;
6355 return r;
6356 }
6357 filepath path(relpath);
6358 string name = path.last_dentry();
6359 path.pop_dentry();
6360
6361 r = path_walk(path, &dir, perm, true);
6362 if (r < 0)
6363 return r;
6364 if (cct->_conf->client_permissions) {
6365 if (S_ISDIR(in->mode)) {
6366 r = -EPERM;
6367 return r;
6368 }
6369 r = may_hardlink(in.get(), perm);
6370 if (r < 0)
6371 return r;
6372 r = may_create(dir.get(), perm);
6373 if (r < 0)
6374 return r;
6375 }
6376 r = _link(in.get(), dir.get(), name.c_str(), perm);
6377 return r;
6378 }
6379
6380 int Client::unlink(const char *relpath, const UserPerm& perm)
6381 {
6382 Mutex::Locker lock(client_lock);
6383 tout(cct) << "unlink" << std::endl;
6384 tout(cct) << relpath << std::endl;
6385
6386 if (unmounting)
6387 return -ENOTCONN;
6388
6389 if (std::string(relpath) == "/")
6390 return -EISDIR;
6391
6392 filepath path(relpath);
6393 string name = path.last_dentry();
6394 path.pop_dentry();
6395 InodeRef dir;
6396 int r = path_walk(path, &dir, perm);
6397 if (r < 0)
6398 return r;
6399 if (cct->_conf->client_permissions) {
6400 r = may_delete(dir.get(), name.c_str(), perm);
6401 if (r < 0)
6402 return r;
6403 }
6404 return _unlink(dir.get(), name.c_str(), perm);
6405 }
6406
6407 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6408 {
6409 Mutex::Locker lock(client_lock);
6410 tout(cct) << "rename" << std::endl;
6411 tout(cct) << relfrom << std::endl;
6412 tout(cct) << relto << std::endl;
6413
6414 if (unmounting)
6415 return -ENOTCONN;
6416
6417 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6418 return -EBUSY;
6419
6420 filepath from(relfrom);
6421 filepath to(relto);
6422 string fromname = from.last_dentry();
6423 from.pop_dentry();
6424 string toname = to.last_dentry();
6425 to.pop_dentry();
6426
6427 InodeRef fromdir, todir;
6428 int r = path_walk(from, &fromdir, perm);
6429 if (r < 0)
6430 goto out;
6431 r = path_walk(to, &todir, perm);
6432 if (r < 0)
6433 goto out;
6434
6435 if (cct->_conf->client_permissions) {
6436 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6437 if (r < 0)
6438 return r;
6439 r = may_delete(todir.get(), toname.c_str(), perm);
6440 if (r < 0 && r != -ENOENT)
6441 return r;
6442 }
6443 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6444 out:
6445 return r;
6446 }
6447
6448 // dirs
6449
6450 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6451 {
6452 Mutex::Locker lock(client_lock);
6453 tout(cct) << "mkdir" << std::endl;
6454 tout(cct) << relpath << std::endl;
6455 tout(cct) << mode << std::endl;
6456 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6457
6458 if (unmounting)
6459 return -ENOTCONN;
6460
6461 if (std::string(relpath) == "/")
6462 return -EEXIST;
6463
6464 filepath path(relpath);
6465 string name = path.last_dentry();
6466 path.pop_dentry();
6467 InodeRef dir;
6468 int r = path_walk(path, &dir, perm);
6469 if (r < 0)
6470 return r;
6471 if (cct->_conf->client_permissions) {
6472 r = may_create(dir.get(), perm);
6473 if (r < 0)
6474 return r;
6475 }
6476 return _mkdir(dir.get(), name.c_str(), mode, perm);
6477 }
6478
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  // Create every missing component of 'relpath' (like `mkdir -p`), walking
  // from the cwd.  Returns -EEXIST if the whole path already exists, or the
  // first error hit while looking up or creating a component.
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  // Phase 1: descend through components that already exist.  The loop
  // leaves i at the first missing component and r at the lookup result.
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  // only a missing component (-ENOENT) means "create from here"; any
  // other lookup failure is propagated
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  // Phase 2: create each remaining component in turn.
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing client may have created an intermediate component between
    // our lookup and mkdir; treat -EEXIST on non-final components as
    // success by looking the directory up instead
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6535
6536 int Client::rmdir(const char *relpath, const UserPerm& perms)
6537 {
6538 Mutex::Locker lock(client_lock);
6539 tout(cct) << "rmdir" << std::endl;
6540 tout(cct) << relpath << std::endl;
6541
6542 if (unmounting)
6543 return -ENOTCONN;
6544
6545 if (std::string(relpath) == "/")
6546 return -EBUSY;
6547
6548 filepath path(relpath);
6549 string name = path.last_dentry();
6550 path.pop_dentry();
6551 InodeRef dir;
6552 int r = path_walk(path, &dir, perms);
6553 if (r < 0)
6554 return r;
6555 if (cct->_conf->client_permissions) {
6556 int r = may_delete(dir.get(), name.c_str(), perms);
6557 if (r < 0)
6558 return r;
6559 }
6560 return _rmdir(dir.get(), name.c_str(), perms);
6561 }
6562
6563 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6564 {
6565 Mutex::Locker lock(client_lock);
6566 tout(cct) << "mknod" << std::endl;
6567 tout(cct) << relpath << std::endl;
6568 tout(cct) << mode << std::endl;
6569 tout(cct) << rdev << std::endl;
6570
6571 if (unmounting)
6572 return -ENOTCONN;
6573
6574 if (std::string(relpath) == "/")
6575 return -EEXIST;
6576
6577 filepath path(relpath);
6578 string name = path.last_dentry();
6579 path.pop_dentry();
6580 InodeRef dir;
6581 int r = path_walk(path, &dir, perms);
6582 if (r < 0)
6583 return r;
6584 if (cct->_conf->client_permissions) {
6585 int r = may_create(dir.get(), perms);
6586 if (r < 0)
6587 return r;
6588 }
6589 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6590 }
6591
6592 // symlinks
6593
6594 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6595 {
6596 Mutex::Locker lock(client_lock);
6597 tout(cct) << "symlink" << std::endl;
6598 tout(cct) << target << std::endl;
6599 tout(cct) << relpath << std::endl;
6600
6601 if (unmounting)
6602 return -ENOTCONN;
6603
6604 if (std::string(relpath) == "/")
6605 return -EEXIST;
6606
6607 filepath path(relpath);
6608 string name = path.last_dentry();
6609 path.pop_dentry();
6610 InodeRef dir;
6611 int r = path_walk(path, &dir, perms);
6612 if (r < 0)
6613 return r;
6614 if (cct->_conf->client_permissions) {
6615 int r = may_create(dir.get(), perms);
6616 if (r < 0)
6617 return r;
6618 }
6619 return _symlink(dir.get(), name.c_str(), target, perms);
6620 }
6621
6622 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6623 {
6624 Mutex::Locker lock(client_lock);
6625 tout(cct) << "readlink" << std::endl;
6626 tout(cct) << relpath << std::endl;
6627
6628 if (unmounting)
6629 return -ENOTCONN;
6630
6631 filepath path(relpath);
6632 InodeRef in;
6633 int r = path_walk(path, &in, perms, false);
6634 if (r < 0)
6635 return r;
6636
6637 return _readlink(in.get(), buf, size);
6638 }
6639
6640 int Client::_readlink(Inode *in, char *buf, size_t size)
6641 {
6642 if (!in->is_symlink())
6643 return -EINVAL;
6644
6645 // copy into buf (at most size bytes)
6646 int r = in->symlink.length();
6647 if (r > (int)size)
6648 r = size;
6649 memcpy(buf, in->symlink.c_str(), r);
6650 return r;
6651 }
6652
6653
6654 // inode stuff
6655
6656 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6657 {
6658 bool yes = in->caps_issued_mask(mask, true);
6659
6660 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6661 if (yes && !force)
6662 return 0;
6663
6664 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6665 filepath path;
6666 in->make_nosnap_relative_path(path);
6667 req->set_filepath(path);
6668 req->set_inode(in);
6669 req->head.args.getattr.mask = mask;
6670
6671 int res = make_request(req, perms);
6672 ldout(cct, 10) << "_getattr result=" << res << dendl;
6673 return res;
6674 }
6675
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  // Apply the attribute changes in stx selected by mask.  Where the client
  // holds exclusive caps, the change is made locally and the corresponding
  // caps are marked dirty; any bits that cannot be satisfied locally are
  // sent to the MDS in a SETATTR request.  The order of the cap checks and
  // mask clearing below is significant — do not reorder.
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not push the containing quota over its limit
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; if we hold none, fall back
    // to asking the MDS to bump the ctime for us
    if (issued & CEPH_CAP_AUTH_EXCL)
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with AUTH_EXCL we can perform ownership/mode changes locally;
    // kill_sguid tracks whether setuid/setgid bits must be cleared
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits change; the file-type bits are kept
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    // with FILE_EXCL, atime/mtime can be updated locally
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
	in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
	in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // if every requested bit was satisfied locally, we are done
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // send the remaining attribute changes to the MDS
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    // refuse sizes beyond the cluster-wide maximum file size
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6866
6867 /* Note that we only care about attrs that setattr cares about */
6868 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6869 {
6870 stx->stx_size = st->st_size;
6871 stx->stx_mode = st->st_mode;
6872 stx->stx_uid = st->st_uid;
6873 stx->stx_gid = st->st_gid;
6874 stx->stx_mtime = st->st_mtim;
6875 stx->stx_atime = st->st_atim;
6876 }
6877
6878 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6879 const UserPerm& perms, InodeRef *inp)
6880 {
6881 int ret = _do_setattr(in, stx, mask, perms, inp);
6882 if (ret < 0)
6883 return ret;
6884 if (mask & CEPH_SETATTR_MODE)
6885 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6886 return ret;
6887 }
6888
6889 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6890 const UserPerm& perms)
6891 {
6892 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6893 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6894 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6895 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6896 if (cct->_conf->client_permissions) {
6897 int r = may_setattr(in.get(), stx, mask, perms);
6898 if (r < 0)
6899 return r;
6900 }
6901 return __setattrx(in.get(), stx, mask, perms);
6902 }
6903
6904 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6905 const UserPerm& perms)
6906 {
6907 struct ceph_statx stx;
6908
6909 stat_to_statx(attr, &stx);
6910 mask &= ~CEPH_SETATTR_BTIME;
6911
6912 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6913 mask &= ~CEPH_SETATTR_UID;
6914 }
6915 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6916 mask &= ~CEPH_SETATTR_GID;
6917 }
6918
6919 return _setattrx(in, &stx, mask, perms);
6920 }
6921
6922 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6923 const UserPerm& perms)
6924 {
6925 Mutex::Locker lock(client_lock);
6926 tout(cct) << "setattr" << std::endl;
6927 tout(cct) << relpath << std::endl;
6928 tout(cct) << mask << std::endl;
6929
6930 if (unmounting)
6931 return -ENOTCONN;
6932
6933 filepath path(relpath);
6934 InodeRef in;
6935 int r = path_walk(path, &in, perms);
6936 if (r < 0)
6937 return r;
6938 return _setattr(in, attr, mask, perms);
6939 }
6940
6941 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6942 const UserPerm& perms, int flags)
6943 {
6944 Mutex::Locker lock(client_lock);
6945 tout(cct) << "setattrx" << std::endl;
6946 tout(cct) << relpath << std::endl;
6947 tout(cct) << mask << std::endl;
6948
6949 if (unmounting)
6950 return -ENOTCONN;
6951
6952 filepath path(relpath);
6953 InodeRef in;
6954 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6955 if (r < 0)
6956 return r;
6957 return _setattrx(in, stx, mask, perms);
6958 }
6959
6960 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6961 {
6962 Mutex::Locker lock(client_lock);
6963 tout(cct) << "fsetattr" << std::endl;
6964 tout(cct) << fd << std::endl;
6965 tout(cct) << mask << std::endl;
6966
6967 if (unmounting)
6968 return -ENOTCONN;
6969
6970 Fh *f = get_filehandle(fd);
6971 if (!f)
6972 return -EBADF;
6973 #if defined(__linux__) && defined(O_PATH)
6974 if (f->flags & O_PATH)
6975 return -EBADF;
6976 #endif
6977 return _setattr(f->inode, attr, mask, perms);
6978 }
6979
6980 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6981 {
6982 Mutex::Locker lock(client_lock);
6983 tout(cct) << "fsetattr" << std::endl;
6984 tout(cct) << fd << std::endl;
6985 tout(cct) << mask << std::endl;
6986
6987 if (unmounting)
6988 return -ENOTCONN;
6989
6990 Fh *f = get_filehandle(fd);
6991 if (!f)
6992 return -EBADF;
6993 #if defined(__linux__) && defined(O_PATH)
6994 if (f->flags & O_PATH)
6995 return -EBADF;
6996 #endif
6997 return _setattrx(f->inode, stx, mask, perms);
6998 }
6999
7000 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7001 frag_info_t *dirstat, int mask)
7002 {
7003 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7004 Mutex::Locker lock(client_lock);
7005 tout(cct) << "stat" << std::endl;
7006 tout(cct) << relpath << std::endl;
7007
7008 if (unmounting)
7009 return -ENOTCONN;
7010
7011 filepath path(relpath);
7012 InodeRef in;
7013 int r = path_walk(path, &in, perms, true, mask);
7014 if (r < 0)
7015 return r;
7016 r = _getattr(in, mask, perms);
7017 if (r < 0) {
7018 ldout(cct, 3) << "stat exit on error!" << dendl;
7019 return r;
7020 }
7021 fill_stat(in, stbuf, dirstat);
7022 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7023 return r;
7024 }
7025
7026 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7027 {
7028 unsigned mask = 0;
7029
7030 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7031 if (flags & AT_NO_ATTR_SYNC)
7032 goto out;
7033
7034 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7035 mask |= CEPH_CAP_PIN;
7036 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7037 mask |= CEPH_CAP_AUTH_SHARED;
7038 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7039 mask |= CEPH_CAP_LINK_SHARED;
7040 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7041 mask |= CEPH_CAP_FILE_SHARED;
7042 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7043 mask |= CEPH_CAP_XATTR_SHARED;
7044 out:
7045 return mask;
7046 }
7047
7048 int Client::statx(const char *relpath, struct ceph_statx *stx,
7049 const UserPerm& perms,
7050 unsigned int want, unsigned int flags)
7051 {
7052 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7053 Mutex::Locker lock(client_lock);
7054 tout(cct) << "statx" << std::endl;
7055 tout(cct) << relpath << std::endl;
7056
7057 if (unmounting)
7058 return -ENOTCONN;
7059
7060 filepath path(relpath);
7061 InodeRef in;
7062
7063 unsigned mask = statx_to_mask(flags, want);
7064
7065 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7066 if (r < 0)
7067 return r;
7068
7069 r = _getattr(in, mask, perms);
7070 if (r < 0) {
7071 ldout(cct, 3) << "statx exit on error!" << dendl;
7072 return r;
7073 }
7074
7075 fill_statx(in, mask, stx);
7076 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7077 return r;
7078 }
7079
7080 int Client::lstat(const char *relpath, struct stat *stbuf,
7081 const UserPerm& perms, frag_info_t *dirstat, int mask)
7082 {
7083 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7084 Mutex::Locker lock(client_lock);
7085 tout(cct) << "lstat" << std::endl;
7086 tout(cct) << relpath << std::endl;
7087
7088 if (unmounting)
7089 return -ENOTCONN;
7090
7091 filepath path(relpath);
7092 InodeRef in;
7093 // don't follow symlinks
7094 int r = path_walk(path, &in, perms, false, mask);
7095 if (r < 0)
7096 return r;
7097 r = _getattr(in, mask, perms);
7098 if (r < 0) {
7099 ldout(cct, 3) << "lstat exit on error!" << dendl;
7100 return r;
7101 }
7102 fill_stat(in, stbuf, dirstat);
7103 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7104 return r;
7105 }
7106
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  // Populate a struct stat from the cached inode state.  Optionally copies
  // out the directory fragment stats (dirstat) and recursive stats (rstat).
  // Returns the caps currently issued on the inode.
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapshot id doubles as the device number
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  st->st_nlink = in->nlink;
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is newer as the ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size" is either the recursive byte count or the entry
    // count, depending on configuration
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    // 512-byte blocks, rounded up
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7153
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  // Populate a ceph_statx from the cached inode state.  'mask' is the cap
  // mask describing which attribute groups are valid; each group that is
  // present sets the corresponding CEPH_STATX_* bits in stx->stx_mask.
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
		 << " mode 0" << oct << in->mode << dec
		 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership, full mode and birth time are covered by AUTH caps
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count is covered by LINK caps
  if (mask & CEPH_CAP_LINK_SHARED) {
    stx->stx_nlink = in->nlink;
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // times, size and block counts are covered by FILE caps
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size" is either the recursive byte count or the
      // entry count, depending on configuration
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7221
void Client::touch_dn(Dentry *dn)
{
  // Mark the dentry as recently used so the LRU trimmer keeps it longer.
  lru.lru_touch(dn);
}
7226
7227 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7228 {
7229 Mutex::Locker lock(client_lock);
7230 tout(cct) << "chmod" << std::endl;
7231 tout(cct) << relpath << std::endl;
7232 tout(cct) << mode << std::endl;
7233
7234 if (unmounting)
7235 return -ENOTCONN;
7236
7237 filepath path(relpath);
7238 InodeRef in;
7239 int r = path_walk(path, &in, perms);
7240 if (r < 0)
7241 return r;
7242 struct stat attr;
7243 attr.st_mode = mode;
7244 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7245 }
7246
7247 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7248 {
7249 Mutex::Locker lock(client_lock);
7250 tout(cct) << "fchmod" << std::endl;
7251 tout(cct) << fd << std::endl;
7252 tout(cct) << mode << std::endl;
7253
7254 if (unmounting)
7255 return -ENOTCONN;
7256
7257 Fh *f = get_filehandle(fd);
7258 if (!f)
7259 return -EBADF;
7260 #if defined(__linux__) && defined(O_PATH)
7261 if (f->flags & O_PATH)
7262 return -EBADF;
7263 #endif
7264 struct stat attr;
7265 attr.st_mode = mode;
7266 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7267 }
7268
7269 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7270 {
7271 Mutex::Locker lock(client_lock);
7272 tout(cct) << "lchmod" << std::endl;
7273 tout(cct) << relpath << std::endl;
7274 tout(cct) << mode << std::endl;
7275
7276 if (unmounting)
7277 return -ENOTCONN;
7278
7279 filepath path(relpath);
7280 InodeRef in;
7281 // don't follow symlinks
7282 int r = path_walk(path, &in, perms, false);
7283 if (r < 0)
7284 return r;
7285 struct stat attr;
7286 attr.st_mode = mode;
7287 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7288 }
7289
7290 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7291 const UserPerm& perms)
7292 {
7293 Mutex::Locker lock(client_lock);
7294 tout(cct) << "chown" << std::endl;
7295 tout(cct) << relpath << std::endl;
7296 tout(cct) << new_uid << std::endl;
7297 tout(cct) << new_gid << std::endl;
7298
7299 if (unmounting)
7300 return -ENOTCONN;
7301
7302 filepath path(relpath);
7303 InodeRef in;
7304 int r = path_walk(path, &in, perms);
7305 if (r < 0)
7306 return r;
7307 struct stat attr;
7308 attr.st_uid = new_uid;
7309 attr.st_gid = new_gid;
7310 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7311 }
7312
7313 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7314 {
7315 Mutex::Locker lock(client_lock);
7316 tout(cct) << "fchown" << std::endl;
7317 tout(cct) << fd << std::endl;
7318 tout(cct) << new_uid << std::endl;
7319 tout(cct) << new_gid << std::endl;
7320
7321 if (unmounting)
7322 return -ENOTCONN;
7323
7324 Fh *f = get_filehandle(fd);
7325 if (!f)
7326 return -EBADF;
7327 #if defined(__linux__) && defined(O_PATH)
7328 if (f->flags & O_PATH)
7329 return -EBADF;
7330 #endif
7331 struct stat attr;
7332 attr.st_uid = new_uid;
7333 attr.st_gid = new_gid;
7334 int mask = 0;
7335 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7336 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7337 return _setattr(f->inode, &attr, mask, perms);
7338 }
7339
7340 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7341 const UserPerm& perms)
7342 {
7343 Mutex::Locker lock(client_lock);
7344 tout(cct) << "lchown" << std::endl;
7345 tout(cct) << relpath << std::endl;
7346 tout(cct) << new_uid << std::endl;
7347 tout(cct) << new_gid << std::endl;
7348
7349 if (unmounting)
7350 return -ENOTCONN;
7351
7352 filepath path(relpath);
7353 InodeRef in;
7354 // don't follow symlinks
7355 int r = path_walk(path, &in, perms, false);
7356 if (r < 0)
7357 return r;
7358 struct stat attr;
7359 attr.st_uid = new_uid;
7360 attr.st_gid = new_gid;
7361 int mask = 0;
7362 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7363 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7364 return _setattr(in, &attr, mask, perms);
7365 }
7366
7367 int Client::utime(const char *relpath, struct utimbuf *buf,
7368 const UserPerm& perms)
7369 {
7370 Mutex::Locker lock(client_lock);
7371 tout(cct) << "utime" << std::endl;
7372 tout(cct) << relpath << std::endl;
7373 tout(cct) << buf->modtime << std::endl;
7374 tout(cct) << buf->actime << std::endl;
7375
7376 if (unmounting)
7377 return -ENOTCONN;
7378
7379 filepath path(relpath);
7380 InodeRef in;
7381 int r = path_walk(path, &in, perms);
7382 if (r < 0)
7383 return r;
7384 struct stat attr;
7385 stat_set_mtime_sec(&attr, buf->modtime);
7386 stat_set_mtime_nsec(&attr, 0);
7387 stat_set_atime_sec(&attr, buf->actime);
7388 stat_set_atime_nsec(&attr, 0);
7389 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7390 }
7391
7392 int Client::lutime(const char *relpath, struct utimbuf *buf,
7393 const UserPerm& perms)
7394 {
7395 Mutex::Locker lock(client_lock);
7396 tout(cct) << "lutime" << std::endl;
7397 tout(cct) << relpath << std::endl;
7398 tout(cct) << buf->modtime << std::endl;
7399 tout(cct) << buf->actime << std::endl;
7400
7401 if (unmounting)
7402 return -ENOTCONN;
7403
7404 filepath path(relpath);
7405 InodeRef in;
7406 // don't follow symlinks
7407 int r = path_walk(path, &in, perms, false);
7408 if (r < 0)
7409 return r;
7410 struct stat attr;
7411 stat_set_mtime_sec(&attr, buf->modtime);
7412 stat_set_mtime_nsec(&attr, 0);
7413 stat_set_atime_sec(&attr, buf->actime);
7414 stat_set_atime_nsec(&attr, 0);
7415 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7416 }
7417
7418 int Client::flock(int fd, int operation, uint64_t owner)
7419 {
7420 Mutex::Locker lock(client_lock);
7421 tout(cct) << "flock" << std::endl;
7422 tout(cct) << fd << std::endl;
7423 tout(cct) << operation << std::endl;
7424 tout(cct) << owner << std::endl;
7425
7426 if (unmounting)
7427 return -ENOTCONN;
7428
7429 Fh *f = get_filehandle(fd);
7430 if (!f)
7431 return -EBADF;
7432
7433 return _flock(f, operation, owner);
7434 }
7435
7436 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7437 {
7438 Mutex::Locker lock(client_lock);
7439 tout(cct) << "opendir" << std::endl;
7440 tout(cct) << relpath << std::endl;
7441
7442 if (unmounting)
7443 return -ENOTCONN;
7444
7445 filepath path(relpath);
7446 InodeRef in;
7447 int r = path_walk(path, &in, perms, true);
7448 if (r < 0)
7449 return r;
7450 if (cct->_conf->client_permissions) {
7451 int r = may_open(in.get(), O_RDONLY, perms);
7452 if (r < 0)
7453 return r;
7454 }
7455 r = _opendir(in.get(), dirpp, perms);
7456 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7457 if (r != -ENOTDIR)
7458 tout(cct) << (unsigned long)*dirpp << std::endl;
7459 return r;
7460 }
7461
7462 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7463 {
7464 if (!in->is_dir())
7465 return -ENOTDIR;
7466 *dirpp = new dir_result_t(in, perms);
7467 opened_dirs.insert(*dirpp);
7468 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7469 return 0;
7470 }
7471
7472
// Close a directory handle returned by opendir().  Always returns 0; the
// handle is freed and must not be used afterwards.
// NOTE(review): unlike most entry points there is no 'unmounting' guard
// here -- presumably deliberate so handles can still be released (and not
// leaked) during shutdown; confirm before adding one.
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
7483
// Tear down a dir_result_t: drop its inode reference and any buffered
// readdir fragment, deregister it from opened_dirs, and free it.
// Caller must hold client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    // release our InodeRef on the directory
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7495
7496 void Client::rewinddir(dir_result_t *dirp)
7497 {
7498 Mutex::Locker lock(client_lock);
7499 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7500
7501 if (unmounting)
7502 return;
7503
7504 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7505 _readdir_drop_dirp_buffer(d);
7506 d->reset();
7507 }
7508
// Report the current readdir position for this handle (telldir(3)).  The
// value is an opaque offset suitable for a later seekdir().
// NOTE(review): unlike seekdir()/rewinddir() this does not take
// client_lock -- confirm callers serialize access to the handle.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
7515
// Reposition the readdir offset (seekdir(3)).  Offsets are the opaque
// values produced by telldir()/readdir: the high bits select a dirfrag (or
// a hash position in hash-order mode) and the low bits index within it.
// The buffered fragment is dropped whenever the new position cannot be
// served from it.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  // any seek invalidates part of the cache-fill bookkeeping
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // in hash-order mode only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // drop the buffer when rewinding to the start, landing in a different
    // frag than the buffered one, or seeking backwards within the frag
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7549
7550
7551 //struct dirent {
7552 // ino_t d_ino; /* inode number */
7553 // off_t d_off; /* offset to the next dirent */
7554 // unsigned short d_reclen; /* length of this record */
7555 // unsigned char d_type; /* type of file */
7556 // char d_name[256]; /* filename */
7557 //};
// Populate a struct dirent from a (name, type, ino) triple.  next_off is
// the opaque offset of the *following* entry (the readdir(3) d_off
// convention).  type is S_IF* mode bits, converted to a DT_* value.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // d_name is a fixed 256-byte array: truncate long names, always terminate
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;  // d_off is absent from the Darwin/FreeBSD dirent
#endif
  // NOTE(review): d_reclen is a dummy 1, not the true record length;
  // consumers here apparently do not rely on it -- confirm before changing.
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7573
// Advance dirp past the buffered fragment: either mark end-of-directory
// (if it was the rightmost frag) or step to the next frag and reset the
// within-frag position to the first real entry (offset 2; 0/1 are "."/"..").
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // offsets encode a hash position in this mode; only move forward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    // the fragtree may have changed underneath us; remap to the live frag
    _readdir_rechoose_frag(dirp);
  }
}
7599
// Re-resolve the current fragment against the inode's dirfragtree; if the
// tree changed (frag split or merged), restart at the beginning of the
// frag that now covers our position.  No-op in hash-order mode, whose
// offsets are frag-independent.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;  // offsets 0/1 are reserved for "." and ".."
  }
}
7616
// Discard the buffered readdir fragment (the dentries most recently
// fetched from the MDS for this handle).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
7622
7623 int Client::_readdir_get_frag(dir_result_t *dirp)
7624 {
7625 assert(dirp);
7626 assert(dirp->inode);
7627
7628 // get the current frag.
7629 frag_t fg;
7630 if (dirp->hash_order())
7631 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7632 else
7633 fg = frag_t(dirp->offset_high());
7634
7635 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7636 << " offset " << hex << dirp->offset << dec << dendl;
7637
7638 int op = CEPH_MDS_OP_READDIR;
7639 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7640 op = CEPH_MDS_OP_LSSNAP;
7641
7642 InodeRef& diri = dirp->inode;
7643
7644 MetaRequest *req = new MetaRequest(op);
7645 filepath path;
7646 diri->make_nosnap_relative_path(path);
7647 req->set_filepath(path);
7648 req->set_inode(diri.get());
7649 req->head.args.readdir.frag = fg;
7650 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7651 if (dirp->last_name.length()) {
7652 req->path2.set_path(dirp->last_name);
7653 } else if (dirp->hash_order()) {
7654 req->head.args.readdir.offset_hash = dirp->offset_high();
7655 }
7656 req->dirp = dirp;
7657
7658 bufferlist dirbl;
7659 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7660
7661 if (res == -EAGAIN) {
7662 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7663 _readdir_rechoose_frag(dirp);
7664 return _readdir_get_frag(dirp);
7665 }
7666
7667 if (res == 0) {
7668 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7669 << " size " << dirp->buffer.size() << dendl;
7670 } else {
7671 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7672 dirp->set_end();
7673 }
7674
7675 return res;
7676 }
7677
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by readdir offset using the frag-aware fpos comparison.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7683
// Serve readdir entries out of the locally cached directory contents
// (Dir::readdir_cache) instead of round-tripping to the MDS.  Invokes cb
// once per entry with client_lock DROPPED.  Returns 0 at end of directory,
// -EAGAIN if the directory stops being complete+ordered mid-walk (caller
// falls back to fragment fetches), a negative error from _getattr()/cb, or
// a positive value if cb asked to stop after an entry.
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  assert(client_lock.is_locked());
  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // binary-search the cache for the first dentry at/after our offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    // re-check every iteration: the lock is dropped around cb below, and
    // the cache is only trustworthy while the dir stays complete+ordered
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    if (getref) {
      // caller asked for a referenced Inode* alongside each entry
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // NOTE(review): pd points into dir->readdir_cache across this unlock;
    // the completeness re-check at the loop top is the guard -- confirm
    // the vector cannot be mutated while the lock is dropped.
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    // record progress so a retry resumes after this entry
    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
7766
// Core readdir driver: walk the directory handle d, invoking
// cb(p, dirent, statx, next_off, inode) once per entry, with client_lock
// dropped around each callback.  Synthesizes "." (offset 0) and ".."
// (offset 1) first, then serves entries from the local cache when the dir
// is complete+ordered with FILE_SHARED caps, otherwise fetches fragments
// from the MDS.  want/flags select the statx fields to fill; when getref
// is true each entry carries a referenced Inode* the callback owns.
// cb returning <0 aborts the walk with that error; >0 stops the walk
// after recording progress (and is returned).  Returns 0 at end of
// directory, or a negative errno.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
	   << dec << " at_end=" << dirp->at_end()
	   << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "." from the directory inode itself
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." from the first parent (or self at the root)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
    // -EAGAIN: cache became unusable mid-walk; fall through to MDS fetch
  }

  // fragment-by-fragment fetch from the MDS
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    // deliver the buffered entries at/after our current offset
    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // rightmost frag done: if nothing changed since we started, we have
    // seen the whole directory and can mark the inode's cache complete
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
7959
7960
7961 int Client::readdir_r(dir_result_t *d, struct dirent *de)
7962 {
7963 return readdirplus_r(d, de, 0, 0, 0, NULL);
7964 }
7965
7966 /*
7967 * readdirplus_r
7968 *
7969 * returns
7970 * 1 if we got a dirent
7971 * 0 for end of directory
7972 * <0 on error
7973 */
7974
/* Capture slot for the single-entry readdir callback: de/stx point at
 * caller-provided output buffers (stx may be NULL), inode receives a
 * referenced Inode* when requested, and full flips to true once one
 * entry has been stored. */
struct single_readdir {
  struct dirent *de;
  struct ceph_statx *stx;
  Inode *inode;
  bool full;
};
7981
7982 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7983 struct ceph_statx *stx, off_t off,
7984 Inode *in)
7985 {
7986 single_readdir *c = static_cast<single_readdir *>(p);
7987
7988 if (c->full)
7989 return -1; // already filled this dirent
7990
7991 *c->de = *de;
7992 if (c->stx)
7993 *c->stx = *stx;
7994 c->inode = in;
7995 c->full = true;
7996 return 1;
7997 }
7998
// readdir(3)-style interface: return a pointer to the next entry, or NULL
// at end of directory / on error (errno set on error).
// NOTE(review): the returned dirent is a function-local static, so the
// result is overwritten by the next call and is not thread-safe -- this
// mirrors the classic readdir(3) contract.
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    // real error (not the callback's -1 "stop" signal): report via errno
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}
8021
8022 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8023 struct ceph_statx *stx, unsigned want,
8024 unsigned flags, Inode **out)
8025 {
8026 single_readdir sr;
8027 sr.de = de;
8028 sr.stx = stx;
8029 sr.inode = NULL;
8030 sr.full = false;
8031
8032 // our callback fills the dirent and sets sr.full=true on first
8033 // call, and returns -1 the second time around.
8034 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8035 if (r < -1)
8036 return r;
8037 if (out)
8038 *out = sr.inode;
8039 if (sr.full)
8040 return 1;
8041 return 0;
8042 }
8043
8044
8045 /* getdents */
/* Accumulator for _getdents(): buf/buflen describe the caller's output
 * buffer, pos is the number of bytes written so far, and fullent selects
 * whole struct dirent records versus NUL-terminated names only. */
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};
8052
8053 static int _readdir_getdent_cb(void *p, struct dirent *de,
8054 struct ceph_statx *stx, off_t off, Inode *in)
8055 {
8056 struct getdents_result *c = static_cast<getdents_result *>(p);
8057
8058 int dlen;
8059 if (c->fullent)
8060 dlen = sizeof(*de);
8061 else
8062 dlen = strlen(de->d_name) + 1;
8063
8064 if (c->pos + dlen > c->buflen)
8065 return -1; // doesn't fit
8066
8067 if (c->fullent) {
8068 memcpy(c->buf + c->pos, de, sizeof(*de));
8069 } else {
8070 memcpy(c->buf + c->pos, de->d_name, dlen);
8071 }
8072 c->pos += dlen;
8073 return 0;
8074 }
8075
8076 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8077 {
8078 getdents_result gr;
8079 gr.buf = buf;
8080 gr.buflen = buflen;
8081 gr.fullent = fullent;
8082 gr.pos = 0;
8083
8084 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8085
8086 if (r < 0) { // some error
8087 if (r == -1) { // buffer ran out of space
8088 if (gr.pos) { // but we got some entries already!
8089 return gr.pos;
8090 } // or we need a larger buffer
8091 return -ERANGE;
8092 } else { // actual error, return it
8093 return r;
8094 }
8095 }
8096 return gr.pos;
8097 }
8098
8099
8100 /* getdir */
/* Accumulator for getdir(): contents collects the entry names, num counts
 * how many were seen. */
struct getdir_result {
  list<string> *contents;
  int num;
};
8105
8106 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8107 {
8108 getdir_result *r = static_cast<getdir_result *>(p);
8109
8110 r->contents->push_back(de->d_name);
8111 r->num++;
8112 return 0;
8113 }
8114
8115 int Client::getdir(const char *relpath, list<string>& contents,
8116 const UserPerm& perms)
8117 {
8118 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8119 {
8120 Mutex::Locker lock(client_lock);
8121 tout(cct) << "getdir" << std::endl;
8122 tout(cct) << relpath << std::endl;
8123 }
8124
8125 dir_result_t *d;
8126 int r = opendir(relpath, &d, perms);
8127 if (r < 0)
8128 return r;
8129
8130 getdir_result gr;
8131 gr.contents = &contents;
8132 gr.num = 0;
8133 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8134
8135 closedir(d);
8136
8137 if (r < 0)
8138 return r;
8139 return gr.num;
8140 }
8141
8142
8143 /****** file i/o **********/
// Open (and with O_CREAT, possibly create) the file at relpath.  Returns
// a nonnegative integer file descriptor on success or a negative errno.
// The layout parameters (stripe_unit/count, object_size, data_pool) are
// only consulted when a new file is created; 0/NULL selects defaults.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file must fail
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // file absent and O_CREAT: create it in the parent directory
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may already have produced an Fh; otherwise open the inode now
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8228
8229 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8230 {
8231 /* Use default file striping parameters */
8232 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8233 }
8234
8235 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8236 const UserPerm& perms)
8237 {
8238 Mutex::Locker lock(client_lock);
8239 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8240
8241 if (unmounting)
8242 return -ENOTCONN;
8243
8244 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8245 filepath path(ino);
8246 req->set_filepath(path);
8247
8248 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8249 char f[30];
8250 sprintf(f, "%u", h);
8251 filepath path2(dirino);
8252 path2.push_dentry(string(f));
8253 req->set_filepath2(path2);
8254
8255 int r = make_request(req, perms, NULL, NULL,
8256 rand() % mdsmap->get_num_in_mds());
8257 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8258 return r;
8259 }
8260
8261
8262 /**
8263 * Load inode into local cache.
8264 *
8265 * If inode pointer is non-NULL, and take a reference on
8266 * the resulting Inode object in one operation, so that caller
8267 * can safely assume inode will still be there after return.
8268 */
// Load inode `ino` into the local cache via CEPH_MDS_OP_LOOKUPINO.  If
// `inode` is non-NULL, on success it receives the cached Inode* with a
// reference taken (_ll_get), so the caller may safely use it after return.
// Returns 0 or a negative errno.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  // any MDS can serve this; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // a successful reply must have populated inode_map with the head inode
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
8292
8293
8294
8295 /**
8296 * Find the parent inode of `ino` and insert it into
8297 * our cache. Conditionally also set `parent` to a referenced
8298 * Inode* if caller provides non-NULL value.
8299 */
// Find the parent inode of `ino` and pull it into the cache
// (CEPH_MDS_OP_LOOKUPPARENT).  If `parent` is non-NULL, a successful MDS
// lookup stores a referenced Inode* there; on failure it is set to NULL.
// Returns 0, -EINVAL for the root, or a negative errno.
// NOTE(review): when a dentry is already present we return 0 WITHOUT
// writing to *parent, so a caller passing non-NULL parent must not read
// it on that path -- confirm all callers handle this.
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  if (!ino->dn_set.empty()) {
    // if we exposed the parent here, we'd need to check permissions,
    // but right now we just rely on the MDS doing so in make_request
    ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
    return 0;
  }

  if (ino->is_root()) {
    *parent = NULL;
    ldout(cct, 3) << "ino is root, no parent" << dendl;
    return -EINVAL;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8340
8341
8342 /**
8343 * Populate the parent dentry for `ino`, provided it is
8344 * a child of `parent`.
8345 */
// Populate the parent dentry for `ino`, provided it is a child of
// directory `parent` (CEPH_MDS_OP_LOOKUPNAME: the MDS replies with the
// name linking the two).  Returns 0 or a negative errno.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());

  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any MDS can serve this; pick one at random
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8365
8366
8367 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8368 {
8369 assert(in);
8370 Fh *f = new Fh(in);
8371 f->mode = cmode;
8372 f->flags = flags;
8373
8374 // inode
8375 f->actor_perms = perms;
8376
8377 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8378
8379 if (in->snapid != CEPH_NOSNAP) {
8380 in->snap_cap_refs++;
8381 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8382 << ccap_string(in->caps_issued()) << dendl;
8383 }
8384
8385 const md_config_t *conf = cct->_conf;
8386 f->readahead.set_trigger_requests(1);
8387 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8388 uint64_t max_readahead = Readahead::NO_LIMIT;
8389 if (conf->client_readahead_max_bytes) {
8390 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8391 }
8392 if (conf->client_readahead_max_periods) {
8393 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8394 }
8395 f->readahead.set_max_readahead_size(max_readahead);
8396 vector<uint64_t> alignments;
8397 alignments.push_back(in->layout.get_period());
8398 alignments.push_back(in->layout.stripe_unit);
8399 f->readahead.set_alignments(alignments);
8400
8401 return f;
8402 }
8403
// Release a file handle: drop any delegation, put the open ref (for head
// inodes this flushes dirty data and re-evaluates caps; for snapshots it
// drops the snap cap ref pinned in _create_fh), release file locks, and
// surface any asynchronous write error accumulated on the handle.
// Returns that async error (0 if none).  The Fh may be freed on return.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last opener in this mode: flush dirty data and let caps be released
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshots only ever hold the ref taken in _create_fh
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8438
8439 void Client::_put_fh(Fh *f)
8440 {
8441 int left = f->put();
8442 if (!left) {
8443 delete f;
8444 }
8445 }
8446
// Open inode `in` with the given flags/mode, producing a new Fh in *fhp
// (if fhp is non-NULL) on success.  If we already hold the caps the open
// mode wants (and no O_TRUNC), the open is satisfied locally; otherwise a
// CEPH_MDS_OP_OPEN round trip is made.  Snapshots are read-only: any
// write-ish flag yields -EROFS.  Returns 0 or a negative errno.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // we already hold the caps this open mode needs: no MDS round trip
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    // O_CREAT is the caller's business (see Client::open); strip it here
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 1) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // failed open: undo the pending-open ref taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8525
/**
 * Re-establish caps for an inode.
 *
 * If we still hold some caps and either want no write caps or still know
 * the auth MDS, a plain cap check suffices.  Otherwise re-issue an OPEN
 * request, with open flags derived from the caps currently wanted, to
 * re-establish state with the MDS.
 *
 * @return 0 if a cap check was enough, otherwise the result of the OPEN.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // translate the wanted file caps back into open(2)-style flags
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8563
8564 int Client::close(int fd)
8565 {
8566 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8567 Mutex::Locker lock(client_lock);
8568 tout(cct) << "close" << std::endl;
8569 tout(cct) << fd << std::endl;
8570
8571 if (unmounting)
8572 return -ENOTCONN;
8573
8574 Fh *fh = get_filehandle(fd);
8575 if (!fh)
8576 return -EBADF;
8577 int err = _release_fh(fh);
8578 fd_map.erase(fd);
8579 put_fd(fd);
8580 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8581 return err;
8582 }
8583
8584
8585 // ------------
8586 // read, write
8587
8588 loff_t Client::lseek(int fd, loff_t offset, int whence)
8589 {
8590 Mutex::Locker lock(client_lock);
8591 tout(cct) << "lseek" << std::endl;
8592 tout(cct) << fd << std::endl;
8593 tout(cct) << offset << std::endl;
8594 tout(cct) << whence << std::endl;
8595
8596 if (unmounting)
8597 return -ENOTCONN;
8598
8599 Fh *f = get_filehandle(fd);
8600 if (!f)
8601 return -EBADF;
8602 #if defined(__linux__) && defined(O_PATH)
8603 if (f->flags & O_PATH)
8604 return -EBADF;
8605 #endif
8606 return _lseek(f, offset, whence);
8607 }
8608
8609 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8610 {
8611 Inode *in = f->inode.get();
8612 int r;
8613
8614 switch (whence) {
8615 case SEEK_SET:
8616 f->pos = offset;
8617 break;
8618
8619 case SEEK_CUR:
8620 f->pos += offset;
8621 break;
8622
8623 case SEEK_END:
8624 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8625 if (r < 0)
8626 return r;
8627 f->pos = in->size + offset;
8628 break;
8629
8630 default:
8631 ceph_abort();
8632 }
8633
8634 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8635 return f->pos;
8636 }
8637
8638
/**
 * Take the per-Fh file-position "lock" (serializes pos-relative reads and
 * writes on the same handle).  Called with client_lock held.
 *
 * Each contended caller queues its own Cond on f->pos_waiters and blocks
 * until the lock is free AND its Cond is at the front of the queue, which
 * gives strict FIFO ordering among waiters.
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until the lock is free and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8656
8657 void Client::unlock_fh_pos(Fh *f)
8658 {
8659 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8660 f->pos_locked = false;
8661 }
8662
/**
 * Migrate a file's MDS-held inline data out to its first RADOS object.
 *
 * Issues two object operations on <ino>.00000000: one to create the
 * object, then a guarded write (cmpxattr on "inline_version") that copies
 * the inline data to offset 0 and records the version in an xattr.  The
 * guard makes the migration idempotent across racing clients.
 *
 * @param onfinish completed with the result of the second mutation
 *                 (or immediately with 0 if there is no inline data)
 * @return always 0; the real outcome is delivered via onfinish
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object name of the file's first stripe unit: "<ino-hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // only proceed if our inline_version is newer than what the object has
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8707
8708 //
8709
8710 // blocking osd interface
8711
8712 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8713 {
8714 Mutex::Locker lock(client_lock);
8715 tout(cct) << "read" << std::endl;
8716 tout(cct) << fd << std::endl;
8717 tout(cct) << size << std::endl;
8718 tout(cct) << offset << std::endl;
8719
8720 if (unmounting)
8721 return -ENOTCONN;
8722
8723 Fh *f = get_filehandle(fd);
8724 if (!f)
8725 return -EBADF;
8726 #if defined(__linux__) && defined(O_PATH)
8727 if (f->flags & O_PATH)
8728 return -EBADF;
8729 #endif
8730 bufferlist bl;
8731 int r = _read(f, offset, size, &bl);
8732 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8733 if (r >= 0) {
8734 bl.copy(0, bl.length(), buf);
8735 r = bl.length();
8736 }
8737 return r;
8738 }
8739
8740 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8741 {
8742 if (iovcnt < 0)
8743 return -EINVAL;
8744 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8745 }
8746
/**
 * Core read path.
 *
 * Reads `size` bytes at `offset` (or at the current fd position, updating
 * it, when offset < 0) into *bl.  Handles inline-data files, cached
 * (objectcacher) reads and synchronous OSD reads, retrying once with a
 * refreshed size when a short sync read may have raced a size change.
 *
 * @return bytes read (bl->length()) or a negative errno
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // negative offset means "read at the shared fd position"; take the pos
  // lock so concurrent pos-relative I/O on this Fh stays serialized.
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we have never learned the inline state;
  // fetch it before deciding how to read.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
        unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the object cache even if we hold the cache cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // can't serve inline data without the cache cap: push the inline
      // data out to RADOS and fall through to a normal read below.
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read directly from the in-memory inline data,
      // zero-filling any gap between the inline length and EOF.
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // O_RSYNC: flush any dirty overlapping buffers before reading
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read near apparent EOF: drop caps, re-verify the size, and
      // retry if the file turned out to be longer than we thought.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait (without client_lock) for the uninline mutation to land
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else already uninlined it; either way the
    // local inline copy is now stale.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8890
// Completion for a background readahead: pins the Fh and records one
// pending readahead for the lifetime of the callback object.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8896
// Mirror of the constructor: un-count the pending readahead and drop the
// Fh reference taken there.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8901
// Readahead I/O completed: release the cap references taken when the
// readahead was issued in _read_async().
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8906
/**
 * Cached read through the objectcacher, plus opportunistic readahead.
 *
 * The requested range is trimmed to the (locally known) file size.  If the
 * data is not fully cached, the call blocks (releasing client_lock) until
 * the cache fill completes.  Afterwards a readahead window computed by
 * f->readahead may be issued in the background via C_Readahead.
 *
 * @return bytes read or a negative errno
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // cache miss: wait (without client_lock) for the fill to finish
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      // C_Readahead pins the Fh; caps are dropped in its finish()
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
8971
/**
 * Synchronous (uncached) read straight from the OSDs via the Filer.
 *
 * Loops issuing blocking reads until `len` bytes are gathered.  A short
 * OSD read inside the known file size is zero-filled (sparse object);
 * a short read at the apparent end of file sets *checkeof so the caller
 * can re-verify the size and retry.
 *
 * @return bytes placed in *bl, or a negative errno
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // block (without client_lock) until the OSD read completes
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // hit apparent EOF: let the caller re-check the size and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9038
9039
/*
 * We keep a count of uncommitted sync writes on the inode, so that
 * fsync can wait for them all to commit ("do the right thing") before
 * returning.
 */
/**
 * Completion bookkeeping for one synchronous OSD write issued by _write():
 * decrement the global unsafe-write count, drop the FILE_BUFFER cap ref
 * taken when the write was submitted, and wake an unmount that is waiting
 * for the last unsafe write to drain.
 */
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9057
9058 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9059 {
9060 Mutex::Locker lock(client_lock);
9061 tout(cct) << "write" << std::endl;
9062 tout(cct) << fd << std::endl;
9063 tout(cct) << size << std::endl;
9064 tout(cct) << offset << std::endl;
9065
9066 if (unmounting)
9067 return -ENOTCONN;
9068
9069 Fh *fh = get_filehandle(fd);
9070 if (!fh)
9071 return -EBADF;
9072 #if defined(__linux__) && defined(O_PATH)
9073 if (fh->flags & O_PATH)
9074 return -EBADF;
9075 #endif
9076 int r = _write(fh, offset, size, buf, NULL, 0);
9077 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9078 return r;
9079 }
9080
9081 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9082 {
9083 if (iovcnt < 0)
9084 return -EINVAL;
9085 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9086 }
9087
/**
 * Shared implementation behind preadv()/pwritev().
 *
 * For writes, the iovec is passed straight through to _write().  For
 * reads, the total length is read into one bufferlist and then scattered
 * back across the iovec, handling the case where fewer bytes than
 * requested were read.
 *
 * @param write true for pwritev semantics, false for preadv
 * @return bytes transferred or a negative errno
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " <<  offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the (possibly short) read back into the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9137
9138 int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9139 const struct iovec *iov, int iovcnt)
9140 {
9141 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9142 return -EFBIG;
9143
9144 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9145 Inode *in = f->inode.get();
9146
9147 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9148 return -ENOSPC;
9149 }
9150
9151 assert(in->snapid == CEPH_NOSNAP);
9152
9153 // was Fh opened as writeable?
9154 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9155 return -EBADF;
9156
9157 // check quota
9158 uint64_t endoff = offset + size;
9159 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9160 f->actor_perms)) {
9161 return -EDQUOT;
9162 }
9163
9164 // use/adjust fd pos?
9165 if (offset < 0) {
9166 lock_fh_pos(f);
9167 /*
9168 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9169 * change out from under us.
9170 */
9171 if (f->flags & O_APPEND) {
9172 int r = _lseek(f, 0, SEEK_END);
9173 if (r < 0) {
9174 unlock_fh_pos(f);
9175 return r;
9176 }
9177 }
9178 offset = f->pos;
9179 f->pos = offset+size;
9180 unlock_fh_pos(f);
9181 }
9182
9183 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9184
9185 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9186
9187 // time it.
9188 utime_t start = ceph_clock_now();
9189
9190 if (in->inline_version == 0) {
9191 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9192 if (r < 0)
9193 return r;
9194 assert(in->inline_version > 0);
9195 }
9196
9197 // copy into fresh buffer (since our write may be resub, async)
9198 bufferlist bl;
9199 if (buf) {
9200 if (size > 0)
9201 bl.append(buf, size);
9202 } else if (iov){
9203 for (int i = 0; i < iovcnt; i++) {
9204 if (iov[i].iov_len > 0) {
9205 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9206 }
9207 }
9208 }
9209
9210 utime_t lat;
9211 uint64_t totalwritten;
9212 int have;
9213 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
9214 CEPH_CAP_FILE_BUFFER, &have, endoff);
9215 if (r < 0)
9216 return r;
9217
9218 /* clear the setuid/setgid bits, if any */
9219 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9220 struct ceph_statx stx = { 0 };
9221
9222 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9223 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9224 if (r < 0)
9225 return r;
9226 } else {
9227 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9228 }
9229
9230 if (f->flags & O_DIRECT)
9231 have &= ~CEPH_CAP_FILE_BUFFER;
9232
9233 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9234
9235 Mutex uninline_flock("Client::_write_uninline_data flock");
9236 Cond uninline_cond;
9237 bool uninline_done = false;
9238 int uninline_ret = 0;
9239 Context *onuninline = NULL;
9240
9241 if (in->inline_version < CEPH_INLINE_NONE) {
9242 if (endoff > cct->_conf->client_max_inline_size ||
9243 endoff > CEPH_INLINE_MAX_SIZE ||
9244 !(have & CEPH_CAP_FILE_BUFFER)) {
9245 onuninline = new C_SafeCond(&uninline_flock,
9246 &uninline_cond,
9247 &uninline_done,
9248 &uninline_ret);
9249 uninline_data(in, onuninline);
9250 } else {
9251 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9252
9253 uint32_t len = in->inline_data.length();
9254
9255 if (endoff < len)
9256 in->inline_data.copy(endoff, len - endoff, bl);
9257
9258 if (offset < len)
9259 in->inline_data.splice(offset, len - offset);
9260 else if (offset > len)
9261 in->inline_data.append_zero(offset - len);
9262
9263 in->inline_data.append(bl);
9264 in->inline_version++;
9265
9266 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9267
9268 goto success;
9269 }
9270 }
9271
9272 if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
9273 // do buffered write
9274 if (!in->oset.dirty_or_tx)
9275 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9276
9277 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9278
9279 // async, caching, non-blocking.
9280 r = objectcacher->file_write(&in->oset, &in->layout,
9281 in->snaprealm->get_snap_context(),
9282 offset, size, bl, ceph::real_clock::now(),
9283 0);
9284 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9285
9286 if (r < 0)
9287 goto done;
9288
9289 // flush cached write if O_SYNC is set on file fh
9290 // O_DSYNC == O_SYNC on linux < 2.6.33
9291 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9292 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9293 _flush_range(in, offset, size);
9294 }
9295 } else {
9296 if (f->flags & O_DIRECT)
9297 _flush_range(in, offset, size);
9298
9299 // simple, non-atomic sync write
9300 Mutex flock("Client::_write flock");
9301 Cond cond;
9302 bool done = false;
9303 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
9304
9305 unsafe_sync_write++;
9306 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9307
9308 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9309 offset, size, bl, ceph::real_clock::now(), 0,
9310 in->truncate_size, in->truncate_seq,
9311 onfinish);
9312 client_lock.Unlock();
9313 flock.Lock();
9314
9315 while (!done)
9316 cond.Wait(flock);
9317 flock.Unlock();
9318 client_lock.Lock();
9319 _sync_write_commit(in);
9320 }
9321
9322 // if we get here, write was successful, update client metadata
9323 success:
9324 // time
9325 lat = ceph_clock_now();
9326 lat -= start;
9327 logger->tinc(l_c_wrlat, lat);
9328
9329 totalwritten = size;
9330 r = (int)totalwritten;
9331
9332 // extend file?
9333 if (totalwritten + offset > in->size) {
9334 in->size = totalwritten + offset;
9335 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9336
9337 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9338 check_caps(in, CHECK_CAPS_NODELAY);
9339 } else if (is_max_size_approaching(in)) {
9340 check_caps(in, 0);
9341 }
9342
9343 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9344 } else {
9345 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9346 }
9347
9348 // mtime
9349 in->mtime = ceph_clock_now();
9350 in->change_attr++;
9351 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9352
9353 done:
9354
9355 if (onuninline) {
9356 client_lock.Unlock();
9357 uninline_flock.Lock();
9358 while (!uninline_done)
9359 uninline_cond.Wait(uninline_flock);
9360 uninline_flock.Unlock();
9361 client_lock.Lock();
9362
9363 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9364 in->inline_data.clear();
9365 in->inline_version = CEPH_INLINE_NONE;
9366 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9367 check_caps(in, 0);
9368 } else
9369 r = uninline_ret;
9370 }
9371
9372 put_cap_ref(in, CEPH_CAP_FILE_WR);
9373 return r;
9374 }
9375
9376 int Client::_flush(Fh *f)
9377 {
9378 Inode *in = f->inode.get();
9379 int err = f->take_async_err();
9380 if (err != 0) {
9381 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9382 << cpp_strerror(err) << dendl;
9383 } else {
9384 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9385 }
9386
9387 return err;
9388 }
9389
/**
 * truncate(2): set the file at `relpath` to `length` bytes.
 * Only stx_size is initialized here; setattrx consults just the fields
 * selected by the CEPH_SETATTR_SIZE mask.
 */
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9396
/**
 * ftruncate(2): set the file behind fd to `length` bytes via setattr.
 *
 * Only attr.st_size is initialized; _setattr consults just the fields
 * selected by the CEPH_SETATTR_SIZE mask.
 *
 * NOTE(review): POSIX requires the fd to be open for writing, but this
 * does not check f->mode for CEPH_FILE_MODE_WR, so truncation through a
 * read-only fd is currently permitted -- confirm whether that is intended.
 */
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ftruncate" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9418
/**
 * fsync(2)/fdatasync(2): flush dirty data (and, unless syncdataonly,
 * metadata) for fd.  Any asynchronous write error recorded on the Fh is
 * surfaced here and then cleared.
 */
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors permit no I/O operations
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9454
/**
 * Flush an inode: write back dirty cached data via the objectcacher and,
 * unless syncdataonly, flush dirty caps (metadata) to the MDS and wait for
 * any unsafe (not-yet-durable) MDS requests on the inode to be committed.
 *
 * @return 0 on success, or the first error from the data writeback
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata to the MDS and remember the flush tid to wait on
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // waiting on the most recent unsafe request implies all earlier ones
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9519
// Fh convenience wrapper: sync the handle's underlying inode.
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
9525
9526 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9527 {
9528 Mutex::Locker lock(client_lock);
9529 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9530 tout(cct) << fd << std::endl;
9531
9532 if (unmounting)
9533 return -ENOTCONN;
9534
9535 Fh *f = get_filehandle(fd);
9536 if (!f)
9537 return -EBADF;
9538 int r = _getattr(f->inode, mask, perms);
9539 if (r < 0)
9540 return r;
9541 fill_stat(f->inode, stbuf, NULL);
9542 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9543 return r;
9544 }
9545
/**
 * statx-style fstat: fill *stx with the fields requested in `want`,
 * honoring the sync/force semantics encoded in `flags`.
 *
 * A getattr round-trip to the MDS is only issued when the caps covering
 * the requested fields are not already held (or a sync is forced).
 */
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    // we don't hold caps for everything requested; ask the MDS
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9575
9576 // not written yet, but i want to link!
9577
/**
 * chdir(2): resolve `relpath`, make it the client's current directory,
 * and return the resulting absolute path in new_cwd.
 *
 * @return 0 on success or a negative errno from the path walk
 */
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap keeps reference counting simple: the old cwd ref is dropped when
  // `in` goes out of scope
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ")  cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9600
/**
 * Build the absolute path of the current working directory by walking
 * parent dentries from cwd up to the mount root.
 *
 * If a parent link is missing from our cache, a LOOKUPNAME request is
 * sent to the MDS and the walk restarts from cwd.  If cwd or an ancestor
 * has been unlinked (dn_set empty), `dir` is left untouched.
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9640
9641 void Client::getcwd(string& dir, const UserPerm& perms)
9642 {
9643 Mutex::Locker l(client_lock);
9644 if (!unmounting)
9645 _getcwd(dir, perms);
9646 }
9647
/**
 * Fill *stbuf with filesystem statistics.  The 'path' argument is unused:
 * statistics are reported either cluster/pool-wide or, when a byte quota
 * applies to the mount's quota root, from that quota.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // With a single data pool, report usage for just that pool; otherwise
  // fall back to whole-cluster statistics.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop client_lock while waiting for the objecter round trip so other
  // client activity is not stalled.
  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = stats.num_objects;
  stbuf->f_ffree = -1;  // -1 == "unknown" for fields we cannot report
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // (kb values are converted from KB units to 4MB-block units.)
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9746
/**
 * Issue a file-lock operation (get/set, fcntl- or flock-style) to the MDS
 * and, on success of a SETFILELOCK, mirror the change into the
 * client-side lock bookkeeping.
 *
 * @param in        inode being locked
 * @param fh        file handle the lock belongs to
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @param sleep     nonzero to block until the lock can be granted
 * @param fl        POSIX flock descriptor (out-parameter for GETFILELOCK)
 * @param owner     lock owner id
 * @param removing  true when called from _release_filelocks(): skip the
 *                  per-handle state update because it is being torn down
 * @return 0 on success, negative errno on failure
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate the POSIX lock type into the ceph lock command.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a SETFILELOCK that actually acquires a lock may block.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: hand an extra request reference (req->get()) to
    // the interrupt machinery so it can abort a blocked request
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra reference taken above
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the lock the MDS reported and translate it back into the
      // caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the granted change into the per-inode lock table,
      // allocating it lazily on first use...
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // ...and into the per-handle table, unless that table is being
      // torn down by _release_filelocks().
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9857
// Interrupt a blocked file-lock request.  Marks 'req' aborted with -EINTR
// (so it will not be re-sent) and, if it already reached an MDS, sends a
// companion *_INTR unlock request so the MDS drops the pending waiter.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Pick the interrupt rule matching the original lock flavour.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // The interrupt request mirrors the original one, but with the *_INTR
  // rule and an unlock type.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9890
9891 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9892 {
9893 if (!in->fcntl_locks && !in->flock_locks)
9894 return;
9895
9896 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9897 ::encode(nr_fcntl_locks, bl);
9898 if (nr_fcntl_locks) {
9899 ceph_lock_state_t* lock_state = in->fcntl_locks;
9900 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9901 p != lock_state->held_locks.end();
9902 ++p)
9903 ::encode(p->second, bl);
9904 }
9905
9906 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9907 ::encode(nr_flock_locks, bl);
9908 if (nr_flock_locks) {
9909 ceph_lock_state_t* lock_state = in->flock_locks;
9910 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9911 p != lock_state->held_locks.end();
9912 ++p)
9913 ::encode(p->second, bl);
9914 }
9915
9916 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9917 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9918 }
9919
9920 void Client::_release_filelocks(Fh *fh)
9921 {
9922 if (!fh->fcntl_locks && !fh->flock_locks)
9923 return;
9924
9925 Inode *in = fh->inode.get();
9926 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9927
9928 list<pair<int, ceph_filelock> > to_release;
9929
9930 if (fh->fcntl_locks) {
9931 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9932 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9933 p != lock_state->held_locks.end();
9934 ++p)
9935 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9936 delete fh->fcntl_locks;
9937 }
9938 if (fh->flock_locks) {
9939 ceph_lock_state_t* lock_state = fh->flock_locks;
9940 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9941 p != lock_state->held_locks.end();
9942 ++p)
9943 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9944 delete fh->flock_locks;
9945 }
9946
9947 if (to_release.empty())
9948 return;
9949
9950 struct flock fl;
9951 memset(&fl, 0, sizeof(fl));
9952 fl.l_whence = SEEK_SET;
9953 fl.l_type = F_UNLCK;
9954
9955 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9956 p != to_release.end();
9957 ++p) {
9958 fl.l_start = p->second.start;
9959 fl.l_len = p->second.length;
9960 fl.l_pid = p->second.pid;
9961 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9962 p->second.owner, true);
9963 }
9964 }
9965
9966 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9967 ceph_lock_state_t *lock_state)
9968 {
9969 int lock_cmd;
9970 if (F_RDLCK == fl->l_type)
9971 lock_cmd = CEPH_LOCK_SHARED;
9972 else if (F_WRLCK == fl->l_type)
9973 lock_cmd = CEPH_LOCK_EXCL;
9974 else
9975 lock_cmd = CEPH_LOCK_UNLOCK;;
9976
9977 ceph_filelock filelock;
9978 filelock.start = fl->l_start;
9979 filelock.length = fl->l_len;
9980 filelock.client = 0;
9981 // see comment in _do_filelock()
9982 filelock.owner = owner | (1ULL << 63);
9983 filelock.pid = fl->l_pid;
9984 filelock.type = lock_cmd;
9985
9986 if (filelock.type == CEPH_LOCK_UNLOCK) {
9987 list<ceph_filelock> activated_locks;
9988 lock_state->remove_lock(filelock, activated_locks);
9989 } else {
9990 bool r = lock_state->add_lock(filelock, false, false, NULL);
9991 assert(r);
9992 }
9993 }
9994
9995 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9996 {
9997 Inode *in = fh->inode.get();
9998 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9999 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10000 return ret;
10001 }
10002
10003 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10004 {
10005 Inode *in = fh->inode.get();
10006 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10007 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10008 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10009 return ret;
10010 }
10011
10012 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10013 {
10014 Inode *in = fh->inode.get();
10015 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10016
10017 int sleep = !(cmd & LOCK_NB);
10018 cmd &= ~LOCK_NB;
10019
10020 int type;
10021 switch (cmd) {
10022 case LOCK_SH:
10023 type = F_RDLCK;
10024 break;
10025 case LOCK_EX:
10026 type = F_WRLCK;
10027 break;
10028 case LOCK_UN:
10029 type = F_UNLCK;
10030 break;
10031 default:
10032 return -EINVAL;
10033 }
10034
10035 struct flock fl;
10036 memset(&fl, 0, sizeof(fl));
10037 fl.l_type = type;
10038 fl.l_whence = SEEK_SET;
10039
10040 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10041 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10042 return ret;
10043 }
10044
10045 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10046 {
10047 /* Since the only thing this does is wrap a call to statfs, and
10048 statfs takes a lock, it doesn't seem we have a need to split it
10049 out. */
10050 return statfs(0, stbuf, perms);
10051 }
10052
10053 void Client::ll_register_callbacks(struct client_callback_args *args)
10054 {
10055 if (!args)
10056 return;
10057 Mutex::Locker l(client_lock);
10058 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
10059 << " invalidate_ino_cb " << args->ino_cb
10060 << " invalidate_dentry_cb " << args->dentry_cb
10061 << " getgroups_cb" << args->getgroups_cb
10062 << " switch_interrupt_cb " << args->switch_intr_cb
10063 << " remount_cb " << args->remount_cb
10064 << dendl;
10065 callback_handle = args->handle;
10066 if (args->ino_cb) {
10067 ino_invalidate_cb = args->ino_cb;
10068 async_ino_invalidator.start();
10069 }
10070 if (args->dentry_cb) {
10071 dentry_invalidate_cb = args->dentry_cb;
10072 async_dentry_invalidator.start();
10073 }
10074 if (args->switch_intr_cb) {
10075 switch_interrupt_cb = args->switch_intr_cb;
10076 interrupt_finisher.start();
10077 }
10078 if (args->remount_cb) {
10079 remount_cb = args->remount_cb;
10080 remount_finisher.start();
10081 }
10082 getgroups_cb = args->getgroups_cb;
10083 umask_cb = args->umask_cb;
10084 }
10085
// Decide how kernel dentry-cache invalidation will be handled: either via
// the registered dentry_invalidate_cb, or by remounting.  Returns 0 if a
// working method is available; on failure either aborts or just logs,
// depending on client_die_on_failed_dentry_invalidate.
// NOTE(review): if can_invalidate is false and no remount_cb is
// registered, this returns 0 without any invalidation method — confirm
// that silent success is intended.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount();
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10111
// Flush all dirty file data and caps and wait until both are stable.
// Caller holds client_lock; it is dropped and re-taken while waiting on
// the object-cacher flush.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher => no locally buffered data

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Wait for the data flush outside client_lock so writeback can make
    // progress; the flush completion signals 'cond' under 'lock'.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10146
10147 int Client::sync_fs()
10148 {
10149 Mutex::Locker l(client_lock);
10150
10151 if (unmounting)
10152 return -ENOTCONN;
10153
10154 return _sync_fs();
10155 }
10156
// Release everything the object cacher will let go of, returning
// objectcacher->release_all()'s count (presumably bytes released —
// confirm against ObjectCacher).
// NOTE(review): unlike most entry points this does not check
// 'unmounting' — verify callers cannot race with unmount.
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10162
10163
// Propagate locally buffered writes for a lazy-io fd.  Currently
// implemented as a full _fsync of the file handle; offset/count are
// accepted but unused.
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now
  // NOTE(review): _fsync()'s return value is discarded, so flush errors
  // are not reported to the caller.
  _fsync(f, true);

  return 0;
}
10179
// Synchronize a lazy-io fd with the cluster: flush local data (_fsync),
// then release cached state (_release) and re-check caps so later reads
// fetch fresh content.  offset/count are accepted but unused.
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // NOTE(review): the _fsync() return value is discarded here as well.
  _fsync(f, true);
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
10196
10197
10198 // =============================
10199 // snaps
10200
10201 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10202 {
10203 Mutex::Locker l(client_lock);
10204
10205 if (unmounting)
10206 return -ENOTCONN;
10207
10208 filepath path(relpath);
10209 InodeRef in;
10210 int r = path_walk(path, &in, perm);
10211 if (r < 0)
10212 return r;
10213 if (cct->_conf->client_permissions) {
10214 r = may_create(in.get(), perm);
10215 if (r < 0)
10216 return r;
10217 }
10218 Inode *snapdir = open_snapdir(in.get());
10219 return _mkdir(snapdir, name, 0, perm);
10220 }
10221
10222 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10223 {
10224 Mutex::Locker l(client_lock);
10225
10226 if (unmounting)
10227 return -ENOTCONN;
10228
10229 filepath path(relpath);
10230 InodeRef in;
10231 int r = path_walk(path, &in, perms);
10232 if (r < 0)
10233 return r;
10234 if (cct->_conf->client_permissions) {
10235 r = may_delete(in.get(), NULL, perms);
10236 if (r < 0)
10237 return r;
10238 }
10239 Inode *snapdir = open_snapdir(in.get());
10240 return _rmdir(snapdir, name, perms);
10241 }
10242
10243 // =============================
10244 // expose caps
10245
10246 int Client::get_caps_issued(int fd) {
10247
10248 Mutex::Locker lock(client_lock);
10249
10250 if (unmounting)
10251 return -ENOTCONN;
10252
10253 Fh *f = get_filehandle(fd);
10254 if (!f)
10255 return -EBADF;
10256
10257 return f->inode->caps_issued();
10258 }
10259
10260 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10261 {
10262 Mutex::Locker lock(client_lock);
10263
10264 if (unmounting)
10265 return -ENOTCONN;
10266
10267 filepath p(path);
10268 InodeRef in;
10269 int r = path_walk(p, &in, perms, true);
10270 if (r < 0)
10271 return r;
10272 return in->caps_issued();
10273 }
10274
10275 // =========================================
10276 // low level
10277
/**
 * Return the pseudo-inode representing diri's ".snap" directory,
 * creating and caching it (keyed by {diri->ino, CEPH_SNAPDIR} in
 * inode_map) on first use.  The snapdir mirrors the owner, mode,
 * timestamps and size of its parent directory.
 */
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Mirror the parent directory's attributes.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10309
/**
 * Low-level lookup of 'name' within directory 'parent'.
 * On success (returns 0): fills *attr, takes an ll reference on the
 * child (released later via ll_forget()) and stores it in *out.
 * On failure: attr->st_ino is 0 and *out is NULL (the InodeRef stayed
 * empty).
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // pin for the caller

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10349
/**
 * Like ll_lookup(), but fills a ceph_statx restricted by want/flags.
 * On success takes an ll reference on the child and returns it in *out;
 * on failure stx->stx_ino and stx->stx_mask are zeroed and *out is NULL.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget()
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10390
/**
 * Resolve a path to an inode (low-level interface).
 * On success: fills *stx per want/flags, takes an ll reference and
 * stores the inode in *out, returning 0.  On failure: zeroes
 * stx_mask/stx_ino, sets *out to NULL and returns a negative errno.
 * AT_SYMLINK_NOFOLLOW in 'flags' keeps a terminal symlink unresolved.
 */
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // pin for the caller; released via ll_forget()
    *out = in.get();
    return 0;
  }
}
10423
// Take one low-level (ll) reference on an inode.  The first ll reference
// also pins the inode itself (get()) and, for a linked directory, pins
// its parent dentry so the path stays resolvable while the reference is
// held.  Balanced by _ll_put().
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10436
// Drop 'num' ll references from an inode.  When the ll refcount reaches
// zero, unpin the parent dentry (for directories) and release the inode
// reference taken by _ll_get().  Returns 0 when the count hit zero,
// otherwise the remaining ll refcount.
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10452
// Drop every outstanding ll reference on every inode (used during
// teardown).  The loop saves 'next' before calling _ll_put() because
// dropping the last reference may erase the current entry from
// inode_map, invalidating its iterator.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref)
      _ll_put(in, in->ll_ref);
  }
}
10467
/**
 * Handle a FUSE-style "forget": drop 'count' ll references on 'in'.
 * Returns true when the inode no longer holds any ll references — or
 * when the forget was ignored (client unmounting, or root inode).
 * A count larger than the current ll_ref is clamped with a warning.
 */
bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // Asked to drop more references than we hold: clamp and warn.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10497
10498 bool Client::ll_put(Inode *in)
10499 {
10500 /* ll_forget already takes the lock */
10501 return ll_forget(in, 1);
10502 }
10503
10504 snapid_t Client::ll_get_snapid(Inode *in)
10505 {
10506 Mutex::Locker lock(client_lock);
10507 return in->snapid;
10508 }
10509
10510 Inode *Client::ll_get_inode(ino_t ino)
10511 {
10512 Mutex::Locker lock(client_lock);
10513
10514 if (unmounting)
10515 return NULL;
10516
10517 vinodeno_t vino = _map_faked_ino(ino);
10518 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10519 if (p == inode_map.end())
10520 return NULL;
10521 Inode *in = p->second;
10522 _ll_get(in);
10523 return in;
10524 }
10525
10526 Inode *Client::ll_get_inode(vinodeno_t vino)
10527 {
10528 Mutex::Locker lock(client_lock);
10529
10530 if (unmounting)
10531 return NULL;
10532
10533 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10534 if (p == inode_map.end())
10535 return NULL;
10536 Inode *in = p->second;
10537 _ll_get(in);
10538 return in;
10539 }
10540
10541 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10542 {
10543 vinodeno_t vino = _get_vino(in);
10544
10545 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10546 tout(cct) << "ll_getattr" << std::endl;
10547 tout(cct) << vino.ino.val << std::endl;
10548
10549 if (vino.snapid < CEPH_NOSNAP)
10550 return 0;
10551 else
10552 return _getattr(in, caps, perms);
10553 }
10554
10555 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10556 {
10557 Mutex::Locker lock(client_lock);
10558
10559 if (unmounting)
10560 return -ENOTCONN;
10561
10562 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10563
10564 if (res == 0)
10565 fill_stat(in, attr);
10566 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10567 return res;
10568 }
10569
10570 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10571 unsigned int flags, const UserPerm& perms)
10572 {
10573 Mutex::Locker lock(client_lock);
10574
10575 if (unmounting)
10576 return -ENOTCONN;
10577
10578 int res = 0;
10579 unsigned mask = statx_to_mask(flags, want);
10580
10581 if (mask && !in->caps_issued_mask(mask, true))
10582 res = _ll_getattr(in, mask, perms);
10583
10584 if (res == 0)
10585 fill_statx(in, mask, stx);
10586 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10587 return res;
10588 }
10589
/**
 * Common implementation for the low-level setattr entry points: trace the
 * request, apply the optional client-side permission check, then perform
 * the attribute change via __setattrx().  On success *inp refers to the
 * updated inode.
 */
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    // client-side permission check unless FUSE enforces permissions itself
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // NOTE(review): the *_NOW bits are stripped after the permission check
  // and before the actual setattr — presumably __setattrx derives "now"
  // timestamps itself; confirm.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10618
10619 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10620 const UserPerm& perms)
10621 {
10622 Mutex::Locker lock(client_lock);
10623
10624 if (unmounting)
10625 return -ENOTCONN;
10626
10627 InodeRef target(in);
10628 int res = _ll_setattrx(in, stx, mask, perms, &target);
10629 if (res == 0) {
10630 assert(in == target.get());
10631 fill_statx(in, in->caps_issued(), stx);
10632 }
10633
10634 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10635 return res;
10636 }
10637
10638 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10639 const UserPerm& perms)
10640 {
10641 struct ceph_statx stx;
10642 stat_to_statx(attr, &stx);
10643
10644 Mutex::Locker lock(client_lock);
10645
10646 if (unmounting)
10647 return -ENOTCONN;
10648
10649 InodeRef target(in);
10650 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10651 if (res == 0) {
10652 assert(in == target.get());
10653 fill_stat(in, attr);
10654 }
10655
10656 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10657 return res;
10658 }
10659
10660
10661 // ----------
10662 // xattrs
10663
10664 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10665 const UserPerm& perms)
10666 {
10667 Mutex::Locker lock(client_lock);
10668
10669 if (unmounting)
10670 return -ENOTCONN;
10671
10672 InodeRef in;
10673 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10674 if (r < 0)
10675 return r;
10676 return _getxattr(in, name, value, size, perms);
10677 }
10678
10679 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10680 const UserPerm& perms)
10681 {
10682 Mutex::Locker lock(client_lock);
10683
10684 if (unmounting)
10685 return -ENOTCONN;
10686
10687 InodeRef in;
10688 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10689 if (r < 0)
10690 return r;
10691 return _getxattr(in, name, value, size, perms);
10692 }
10693
10694 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10695 const UserPerm& perms)
10696 {
10697 Mutex::Locker lock(client_lock);
10698
10699 if (unmounting)
10700 return -ENOTCONN;
10701
10702 Fh *f = get_filehandle(fd);
10703 if (!f)
10704 return -EBADF;
10705 return _getxattr(f->inode, name, value, size, perms);
10706 }
10707
10708 int Client::listxattr(const char *path, char *list, size_t size,
10709 const UserPerm& perms)
10710 {
10711 Mutex::Locker lock(client_lock);
10712
10713 if (unmounting)
10714 return -ENOTCONN;
10715
10716 InodeRef in;
10717 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10718 if (r < 0)
10719 return r;
10720 return Client::_listxattr(in.get(), list, size, perms);
10721 }
10722
10723 int Client::llistxattr(const char *path, char *list, size_t size,
10724 const UserPerm& perms)
10725 {
10726 Mutex::Locker lock(client_lock);
10727
10728 if (unmounting)
10729 return -ENOTCONN;
10730
10731 InodeRef in;
10732 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10733 if (r < 0)
10734 return r;
10735 return Client::_listxattr(in.get(), list, size, perms);
10736 }
10737
10738 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10739 {
10740 Mutex::Locker lock(client_lock);
10741
10742 if (unmounting)
10743 return -ENOTCONN;
10744
10745 Fh *f = get_filehandle(fd);
10746 if (!f)
10747 return -EBADF;
10748 return Client::_listxattr(f->inode.get(), list, size, perms);
10749 }
10750
10751 int Client::removexattr(const char *path, const char *name,
10752 const UserPerm& perms)
10753 {
10754 Mutex::Locker lock(client_lock);
10755
10756 if (unmounting)
10757 return -ENOTCONN;
10758
10759 InodeRef in;
10760 int r = Client::path_walk(path, &in, perms, true);
10761 if (r < 0)
10762 return r;
10763 return _removexattr(in, name, perms);
10764 }
10765
10766 int Client::lremovexattr(const char *path, const char *name,
10767 const UserPerm& perms)
10768 {
10769 Mutex::Locker lock(client_lock);
10770
10771 if (unmounting)
10772 return -ENOTCONN;
10773
10774 InodeRef in;
10775 int r = Client::path_walk(path, &in, perms, false);
10776 if (r < 0)
10777 return r;
10778 return _removexattr(in, name, perms);
10779 }
10780
10781 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10782 {
10783 Mutex::Locker lock(client_lock);
10784
10785 if (unmounting)
10786 return -ENOTCONN;
10787
10788 Fh *f = get_filehandle(fd);
10789 if (!f)
10790 return -EBADF;
10791 return _removexattr(f->inode, name, perms);
10792 }
10793
10794 int Client::setxattr(const char *path, const char *name, const void *value,
10795 size_t size, int flags, const UserPerm& perms)
10796 {
10797 _setxattr_maybe_wait_for_osdmap(name, value, size);
10798
10799 Mutex::Locker lock(client_lock);
10800
10801 if (unmounting)
10802 return -ENOTCONN;
10803
10804 InodeRef in;
10805 int r = Client::path_walk(path, &in, perms, true);
10806 if (r < 0)
10807 return r;
10808 return _setxattr(in, name, value, size, flags, perms);
10809 }
10810
10811 int Client::lsetxattr(const char *path, const char *name, const void *value,
10812 size_t size, int flags, const UserPerm& perms)
10813 {
10814 _setxattr_maybe_wait_for_osdmap(name, value, size);
10815
10816 Mutex::Locker lock(client_lock);
10817
10818 if (unmounting)
10819 return -ENOTCONN;
10820
10821 InodeRef in;
10822 int r = Client::path_walk(path, &in, perms, false);
10823 if (r < 0)
10824 return r;
10825 return _setxattr(in, name, value, size, flags, perms);
10826 }
10827
10828 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10829 int flags, const UserPerm& perms)
10830 {
10831 _setxattr_maybe_wait_for_osdmap(name, value, size);
10832
10833 Mutex::Locker lock(client_lock);
10834
10835 if (unmounting)
10836 return -ENOTCONN;
10837
10838 Fh *f = get_filehandle(fd);
10839 if (!f)
10840 return -EBADF;
10841 return _setxattr(f->inode, name, value, size, flags, perms);
10842 }
10843
10844 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10845 const UserPerm& perms)
10846 {
10847 int r;
10848
10849 const VXattr *vxattr = _match_vxattr(in, name);
10850 if (vxattr) {
10851 r = -ENODATA;
10852
10853 // Do a force getattr to get the latest quota before returning
10854 // a value to userspace.
10855 r = _getattr(in, 0, perms, true);
10856 if (r != 0) {
10857 // Error from getattr!
10858 return r;
10859 }
10860
10861 // call pointer-to-member function
10862 char buf[256];
10863 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10864 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10865 } else {
10866 r = -ENODATA;
10867 }
10868
10869 if (size != 0) {
10870 if (r > (int)size) {
10871 r = -ERANGE;
10872 } else if (r > 0) {
10873 memcpy(value, buf, r);
10874 }
10875 }
10876 goto out;
10877 }
10878
10879 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10880 r = -EOPNOTSUPP;
10881 goto out;
10882 }
10883
10884 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10885 if (r == 0) {
10886 string n(name);
10887 r = -ENODATA;
10888 if (in->xattrs.count(n)) {
10889 r = in->xattrs[n].length();
10890 if (r > 0 && size != 0) {
10891 if (size >= (unsigned)r)
10892 memcpy(value, in->xattrs[n].c_str(), r);
10893 else
10894 r = -ERANGE;
10895 }
10896 }
10897 }
10898 out:
10899 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
10900 return r;
10901 }
10902
10903 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10904 const UserPerm& perms)
10905 {
10906 if (cct->_conf->client_permissions) {
10907 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10908 if (r < 0)
10909 return r;
10910 }
10911 return _getxattr(in.get(), name, value, size, perms);
10912 }
10913
// Low-level (libcephfs/FUSE) getxattr entry point: logs and traces the
// call, optionally performs the client-side permission check, then
// delegates to _getxattr.
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_getxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce read
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
10937
// Core listxattr implementation: writes the NUL-separated list of
// xattr names (real xattrs followed by visible virtual xattrs) into
// `name`, or just computes the required size when size == 0.
// Returns the total byte count on success, -ERANGE if the buffer is
// too small, or a negative errno from _getattr.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Refresh the xattr map from the MDS only if we have never seen it.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: compute the total size needed (each name plus its
    // NUL terminator).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    // Account for the non-hidden virtual xattrs for this inode type.
    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    // size == 0 is a pure size probe; otherwise fill the buffer.
    if (size != 0) {
      if (size >= (unsigned)r) {
	// Real xattr names first; `name` advances past each
	// NUL-terminated entry.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	// Then the virtual xattrs that are not hidden and currently
	// exist on this inode.
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
10982
// Low-level (libcephfs/FUSE) listxattr entry point: logs and traces
// the call, then delegates to _listxattr.  No permission check is
// performed here (listing names only).
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
  tout(cct) << "ll_listxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
11000
11001 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11002 size_t size, int flags, const UserPerm& perms)
11003 {
11004
11005 int xattr_flags = 0;
11006 if (!value)
11007 xattr_flags |= CEPH_XATTR_REMOVE;
11008 if (flags & XATTR_CREATE)
11009 xattr_flags |= CEPH_XATTR_CREATE;
11010 if (flags & XATTR_REPLACE)
11011 xattr_flags |= CEPH_XATTR_REPLACE;
11012
11013 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11014 filepath path;
11015 in->make_nosnap_relative_path(path);
11016 req->set_filepath(path);
11017 req->set_string2(name);
11018 req->set_inode(in);
11019 req->head.args.setxattr.flags = xattr_flags;
11020
11021 bufferlist bl;
11022 bl.append((const char*)value, size);
11023 req->set_data(bl);
11024
11025 int res = make_request(req, perms);
11026
11027 trim_cache();
11028 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
11029 res << dendl;
11030 return res;
11031 }
11032
// Validate and dispatch a setxattr.  Rejects snapshots (-EROFS) and
// unsupported namespaces (-EOPNOTSUPP).  POSIX ACL xattrs get special
// handling: an access ACL may be folded into the file mode, and a
// default ACL is validated and restricted to directories.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same xattr namespaces the kernel client supports.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// posix_acl_equiv_mode returns 0 when the ACL is fully
	// representable as a plain mode, in which case the xattr
	// itself is dropped (value = NULL) and only the mode is set.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	// An empty/trivial default ACL is stored as a removal.
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // Read-only virtual xattrs cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11093
11094 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11095 size_t size, int flags, const UserPerm& perms)
11096 {
11097 if (cct->_conf->client_permissions) {
11098 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11099 if (r < 0)
11100 return r;
11101 }
11102 return _setxattr(in.get(), name, value, size, flags, perms);
11103 }
11104
// Check whether a layout xattr value references a data pool known to
// the given osdmap.  `name` is the layout attribute suffix ("layout"
// or "layout.pool"); for the former, the "pool=" key is parsed out of
// the key/value string.  Returns 0 if no pool is named or the pool
// exists, -EINVAL on parse failure, -ENOENT if the pool is unknown.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // The whole value must have been consumed by the parser.
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    // The pool may be given numerically or by name; try the numeric
    // interpretation first and fall back to a name lookup.
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11144
// If the xattr being set is a layout (pool) attribute and the named
// pool is not in our current osdmap, block until the latest osdmap is
// received.  Must be called WITHOUT client_lock held (it waits).
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // Strip the "ceph.file."/"ceph.dir." prefix; the checker only
    // cares about the "layout"/"layout.pool" suffix.
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Pool unknown locally: it may have just been created, so fetch
    // the newest osdmap and let the MDS make the final call.
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11165
// Low-level (libcephfs/FUSE) setxattr entry point: waits for a fresh
// osdmap if needed (before taking client_lock), logs/traces, optionally
// enforces write permission, then delegates to _setxattr.
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_setxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce write
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
11190
// Core removexattr implementation: validates the name, then issues a
// CEPH_MDS_OP_RMXATTR request to the MDS.  Rejects snapshots (-EROFS),
// unsupported namespaces and read-only virtual xattrs (-EOPNOTSUPP).
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11222
11223 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11224 {
11225 if (cct->_conf->client_permissions) {
11226 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11227 if (r < 0)
11228 return r;
11229 }
11230 return _removexattr(in.get(), name, perms);
11231 }
11232
// Low-level (libcephfs/FUSE) removexattr entry point: logs/traces,
// optionally enforces write permission, then delegates to _removexattr.
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  // When FUSE is not handling permission checks itself, enforce write
  // permission on the xattr here.
  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
11255
// Quota vxattrs exist only when a quota is actually configured on the
// inode.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
// Format the combined "ceph.quota" value: both limits in one string.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// Format "ceph.quota.max_bytes".
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// Format "ceph.quota.max_files".
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11275
// Layout vxattrs exist only when the inode's layout differs from the
// default-constructed layout.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Format the combined "ceph.{file,dir}.layout" value.  The pool is
// rendered by name when the osdmap knows it, by id otherwise.
// NOTE(review): if the formatted string exceeds `size`, snprintf's
// return value makes `val + r` point past the buffer and `size - r`
// underflow on the next call — presumably the 256-byte caller buffer
// is always large enough, but confirm for long pool/namespace names.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  // Append the pool namespace only when one is set.
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
// Format "ceph.*.layout.stripe_unit".
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
// Format "ceph.*.layout.stripe_count".
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
// Format "ceph.*.layout.object_size".
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
// Format "ceph.*.layout.pool": pool name when the osdmap knows the
// pool id, otherwise the numeric id.  `r` is always assigned because
// with_osdmap invokes the lambda unconditionally.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
	      in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// Format "ceph.*.layout.pool_namespace".
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Format "ceph.dir.entries": files plus subdirectories.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
// Format "ceph.dir.files".
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
// Format "ceph.dir.subdirs".
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
// Format "ceph.dir.rentries": recursive files plus subdirectories.
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
// Format "ceph.dir.rfiles" (recursive file count).
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
// Format "ceph.dir.rsubdirs" (recursive subdirectory count).
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
// Format "ceph.dir.rbytes" (recursive byte count).
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
11356 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11357 {
11358 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11359 (long)in->rstat.rctime.nsec());
11360 }
11361
// Helpers for building virtual-xattr table entries.  These use the
// GNU designated-initializer-with-colon syntax (name: ...), matching
// the declaration order of Client::VXattr.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// A visible, read-only vxattr (e.g. directory statistics).
#define XATTR_NAME_CEPH(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true, \
  hidden: false, \
  exists_cb: NULL, \
}
// A hidden, writable per-field layout vxattr; only listed/served when
// the inode has a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
  name: CEPH_XATTR_NAME2(_type, _name, _field), \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_layout_exists, \
}
// A hidden, writable per-field quota vxattr; only listed/served when a
// quota is enabled on the inode.
#define XATTR_QUOTA_FIELD(_type, _name) \
{ \
  name: CEPH_XATTR_NAME(_type, _name), \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false, \
  hidden: true, \
  exists_cb: &Client::_vxattrcb_quota_exists, \
}
11389
// Virtual xattrs served for directory inodes.  The empty-name entry
// terminates the table (see _match_vxattr / _vxattrs_calcu_name_size).
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH(dir, rentries),
  XATTR_NAME_CEPH(dir, rfiles),
  XATTR_NAME_CEPH(dir, rsubdirs),
  XATTR_NAME_CEPH(dir, rbytes),
  XATTR_NAME_CEPH(dir, rctime),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" }     /* Required table terminator */
};
11422
// Virtual xattrs served for regular-file inodes.  The empty-name entry
// terminates the table.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" }     /* Required table terminator */
};
11438
11439 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11440 {
11441 if (in->is_dir())
11442 return _dir_vxattrs;
11443 else if (in->is_file())
11444 return _file_vxattrs;
11445 return NULL;
11446 }
11447
11448 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11449 {
11450 if (strncmp(name, "ceph.", 5) == 0) {
11451 const VXattr *vxattr = _get_vxattrs(in);
11452 if (vxattr) {
11453 while (!vxattr->name.empty()) {
11454 if (vxattr->name == name)
11455 return vxattr;
11456 vxattr++;
11457 }
11458 }
11459 }
11460 return NULL;
11461 }
11462
11463 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11464 {
11465 size_t len = 0;
11466 while (!vxattr->name.empty()) {
11467 if (!vxattr->hidden)
11468 len += vxattr->name.length() + 1;
11469 vxattr++;
11470 }
11471 return len;
11472 }
11473
// Low-level readlink: touches the dentries referring to the symlink
// (to keep them warm in the LRU) and copies the link target into buf.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  // Refresh LRU position of every dentry that links to this inode.
  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11497
// Create a device/special file entry under `dir`.  Builds and sends a
// CEPH_MDS_OP_MKNOD request; on success *inp refers to the new inode.
// Errors: -ENAMETOOLONG, -EROFS for snapshots, -EDQUOT when the file
// quota is exceeded, or the MDS reply code.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11551
// Low-level mknod: permission check, _mknod, then fill `attr` and take
// an ll reference on the new inode for the caller.
// NOTE(review): on failure `attr` is left untouched, so the
// attr->st_ino trace/log below reads caller-provided (possibly
// uninitialized) memory — confirm callers zero `attr` first.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11588
// statx-flavored low-level mknod: like ll_mknod but fills a
// ceph_statx with only the fields requested via want/flags.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11627
// Create (and optionally open) a regular file under `dir`.
// Builds a CEPH_MDS_OP_CREATE request carrying the requested file
// layout (stripe/object size, data pool) and any inherited ACL
// xattrs.  On success *inp refers to the new inode, *created reports
// whether the MDS actually created it, and when fhp != NULL the file
// is opened and a new Fh is returned through it.
// Errors: -ENAMETOOLONG, -EROFS for snapshots, -EDQUOT on quota,
// -EINVAL for bad flags or an unknown data pool, -ERANGE for an
// over-large pool id, or the MDS reply code.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // Resolve the requested data pool name to an id, if one was given.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11720
11721
// Create a directory (or a snapshot when `dir` is the snapdir) under
// `dir`.  On success *inp refers to the new inode.
// Errors: -ENAMETOOLONG, -EROFS for non-snapdir snapshots, -EDQUOT on
// quota, or the MDS reply code.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // mkdir inside the snapdir creates a snapshot, which is allowed;
  // any other snapshot context is read-only.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also modify `mode`.
  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // make_request was never reached, so the request must be released
  // here.
  put_request(req);
  return res;
}
11777
// Low-level mkdir: permission check, _mkdir, then fill `attr` and take
// an ll reference on the new inode for the caller.
// NOTE(review): on failure `attr` is left untouched (unlike ll_mkdirx,
// which zeroes stx), so the attr->st_ino trace below reads
// caller-provided memory — confirm callers zero `attr` first.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  // When FUSE is not handling permission checks itself, verify the
  // caller may create entries in the parent directory.
  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11812
11813 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11814 struct ceph_statx *stx, unsigned want, unsigned flags,
11815 const UserPerm& perms)
11816 {
11817 Mutex::Locker lock(client_lock);
11818
11819 if (unmounting)
11820 return -ENOTCONN;
11821
11822 vinodeno_t vparent = _get_vino(parent);
11823
11824 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11825 tout(cct) << "ll_mkdirx" << std::endl;
11826 tout(cct) << vparent.ino.val << std::endl;
11827 tout(cct) << name << std::endl;
11828 tout(cct) << mode << std::endl;
11829
11830 if (!cct->_conf->fuse_default_permissions) {
11831 int r = may_create(parent, perms);
11832 if (r < 0)
11833 return r;
11834 }
11835
11836 InodeRef in;
11837 int r = _mkdir(parent, name, mode, perms, &in);
11838 if (r == 0) {
11839 fill_statx(in, statx_to_mask(flags, want), stx);
11840 _ll_get(in.get());
11841 } else {
11842 stx->stx_ino = 0;
11843 stx->stx_mask = 0;
11844 }
11845 tout(cct) << stx->stx_ino << std::endl;
11846 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11847 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11848 *out = in.get();
11849 return r;
11850 }
11851
// Create symlink `name` -> `target` in directory `dir`.  On success *inp
// holds a reference to the new symlink inode.  Returns 0 or negative errno.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // symlinks can only be created in the live (non-snapshot) tree
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // the link target travels in string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
11897
11898 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11899 struct stat *attr, Inode **out, const UserPerm& perms)
11900 {
11901 Mutex::Locker lock(client_lock);
11902
11903 if (unmounting)
11904 return -ENOTCONN;
11905
11906 vinodeno_t vparent = _get_vino(parent);
11907
11908 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11909 << dendl;
11910 tout(cct) << "ll_symlink" << std::endl;
11911 tout(cct) << vparent.ino.val << std::endl;
11912 tout(cct) << name << std::endl;
11913 tout(cct) << value << std::endl;
11914
11915 if (!cct->_conf->fuse_default_permissions) {
11916 int r = may_create(parent, perms);
11917 if (r < 0)
11918 return r;
11919 }
11920
11921 InodeRef in;
11922 int r = _symlink(parent, name, value, perms, &in);
11923 if (r == 0) {
11924 fill_stat(in, attr);
11925 _ll_get(in.get());
11926 }
11927 tout(cct) << attr->st_ino << std::endl;
11928 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11929 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11930 *out = in.get();
11931 return r;
11932 }
11933
11934 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11935 Inode **out, struct ceph_statx *stx, unsigned want,
11936 unsigned flags, const UserPerm& perms)
11937 {
11938 Mutex::Locker lock(client_lock);
11939
11940 if (unmounting)
11941 return -ENOTCONN;
11942
11943 vinodeno_t vparent = _get_vino(parent);
11944
11945 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11946 << dendl;
11947 tout(cct) << "ll_symlinkx" << std::endl;
11948 tout(cct) << vparent.ino.val << std::endl;
11949 tout(cct) << name << std::endl;
11950 tout(cct) << value << std::endl;
11951
11952 if (!cct->_conf->fuse_default_permissions) {
11953 int r = may_create(parent, perms);
11954 if (r < 0)
11955 return r;
11956 }
11957
11958 InodeRef in;
11959 int r = _symlink(parent, name, value, perms, &in);
11960 if (r == 0) {
11961 fill_statx(in, statx_to_mask(flags, want), stx);
11962 _ll_get(in.get());
11963 }
11964 tout(cct) << stx->stx_ino << std::endl;
11965 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11966 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11967 *out = in.get();
11968 return r;
11969 }
11970
// Remove the link `name` from directory `dir`.  The victim inode's
// delegations are broken before the MDS request is issued.
// Returns 0 or a negative errno.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // snapshot trees are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // look up the victim so we can break its delegations and tell the MDS
  // which caps to drop on it
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12020
12021 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12022 {
12023 Mutex::Locker lock(client_lock);
12024
12025 if (unmounting)
12026 return -ENOTCONN;
12027
12028 vinodeno_t vino = _get_vino(in);
12029
12030 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12031 tout(cct) << "ll_unlink" << std::endl;
12032 tout(cct) << vino.ino.val << std::endl;
12033 tout(cct) << name << std::endl;
12034
12035 if (!cct->_conf->fuse_default_permissions) {
12036 int r = may_delete(in, name, perm);
12037 if (r < 0)
12038 return r;
12039 }
12040 return _unlink(in, name, perm);
12041 }
12042
12043 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
12044 {
12045 ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
12046 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
12047
12048 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
12049 return -EROFS;
12050 }
12051
12052 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
12053 MetaRequest *req = new MetaRequest(op);
12054 filepath path;
12055 dir->make_nosnap_relative_path(path);
12056 path.push_dentry(name);
12057 req->set_filepath(path);
12058
12059 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12060 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12061 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12062
12063 InodeRef in;
12064
12065 Dentry *de;
12066 int res = get_or_create(dir, name, &de);
12067 if (res < 0)
12068 goto fail;
12069 if (op == CEPH_MDS_OP_RMDIR)
12070 req->set_dentry(de);
12071 else
12072 de->get();
12073
12074 res = _lookup(dir, name, 0, &in, perms);
12075 if (res < 0)
12076 goto fail;
12077 if (op == CEPH_MDS_OP_RMDIR) {
12078 req->set_inode(dir);
12079 req->set_other_inode(in.get());
12080 } else {
12081 unlink(de, true, true);
12082 de->put();
12083 req->set_other_inode(in.get());
12084 }
12085
12086 res = make_request(req, perms);
12087
12088 trim_cache();
12089 ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
12090 return res;
12091
12092 fail:
12093 put_request(req);
12094 return res;
12095 }
12096
12097 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12098 {
12099 Mutex::Locker lock(client_lock);
12100
12101 if (unmounting)
12102 return -ENOTCONN;
12103
12104 vinodeno_t vino = _get_vino(in);
12105
12106 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12107 tout(cct) << "ll_rmdir" << std::endl;
12108 tout(cct) << vino.ino.val << std::endl;
12109 tout(cct) << name << std::endl;
12110
12111 if (!cct->_conf->fuse_default_permissions) {
12112 int r = may_delete(in, name, perms);
12113 if (r < 0)
12114 return r;
12115 }
12116
12117 return _rmdir(in, name, perms);
12118 }
12119
// Rename `fromdir`/`fromname` to `todir`/`toname`.  Renames inside the
// virtual .snap directory become RENAMESNAP operations.  Cross-snapshot
// and cross-quota-root renames are rejected with -EXDEV.
// Returns 0 or a negative errno.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // only renaming a snapshot within the same .snap dir is allowed;
    // everything else in snapshot space is read-only
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // disallow renames that cross quota roots; quota accounting cannot
    // follow the moved subtree
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    // the source inode must exist; break its delegations before the move
    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // the destination may or may not exist: if it does it will be
    // replaced (break its delegations too); ENOENT is fine; any other
    // lookup error aborts the rename
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12224
12225 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12226 const char *newname, const UserPerm& perm)
12227 {
12228 Mutex::Locker lock(client_lock);
12229
12230 if (unmounting)
12231 return -ENOTCONN;
12232
12233 vinodeno_t vparent = _get_vino(parent);
12234 vinodeno_t vnewparent = _get_vino(newparent);
12235
12236 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12237 << vnewparent << " " << newname << dendl;
12238 tout(cct) << "ll_rename" << std::endl;
12239 tout(cct) << vparent.ino.val << std::endl;
12240 tout(cct) << name << std::endl;
12241 tout(cct) << vnewparent.ino.val << std::endl;
12242 tout(cct) << newname << std::endl;
12243
12244 if (!cct->_conf->fuse_default_permissions) {
12245 int r = may_delete(parent, name, perm);
12246 if (r < 0)
12247 return r;
12248 r = may_delete(newparent, newname, perm);
12249 if (r < 0 && r != -ENOENT)
12250 return r;
12251 }
12252
12253 return _rename(parent, name, newparent, newname, perm);
12254 }
12255
// Create a hard link `dir`/`newname` pointing at existing inode `in`.
// On success *inp holds a reference to the linked inode.
// Returns 0 or a negative errno.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // both the target and the directory must be in the live tree
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // new name in filepath, existing inode in filepath2
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);  // drop the request that was never submitted
  return res;
}
12300
12301 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12302 const UserPerm& perm)
12303 {
12304 Mutex::Locker lock(client_lock);
12305
12306 if (unmounting)
12307 return -ENOTCONN;
12308
12309 vinodeno_t vino = _get_vino(in);
12310 vinodeno_t vnewparent = _get_vino(newparent);
12311
12312 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12313 newname << dendl;
12314 tout(cct) << "ll_link" << std::endl;
12315 tout(cct) << vino.ino.val << std::endl;
12316 tout(cct) << vnewparent << std::endl;
12317 tout(cct) << newname << std::endl;
12318
12319 int r = 0;
12320 InodeRef target;
12321
12322 if (!cct->_conf->fuse_default_permissions) {
12323 if (S_ISDIR(in->mode))
12324 return -EPERM;
12325
12326 r = may_hardlink(in, perm);
12327 if (r < 0)
12328 return r;
12329
12330 r = may_create(newparent, perm);
12331 if (r < 0)
12332 return r;
12333 }
12334
12335 return _link(in, newparent, newname, perm, &target);
12336 }
12337
12338 int Client::ll_num_osds(void)
12339 {
12340 Mutex::Locker lock(client_lock);
12341 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12342 }
12343
12344 int Client::ll_osdaddr(int osd, uint32_t *addr)
12345 {
12346 Mutex::Locker lock(client_lock);
12347
12348 entity_addr_t g;
12349 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12350 if (!o.exists(osd))
12351 return false;
12352 g = o.get_addr(osd);
12353 return true;
12354 });
12355 if (!exists)
12356 return -1;
12357 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12358 *addr = ntohl(nb_addr);
12359 return 0;
12360 }
12361
12362 uint32_t Client::ll_stripe_unit(Inode *in)
12363 {
12364 Mutex::Locker lock(client_lock);
12365 return in->layout.stripe_unit;
12366 }
12367
12368 uint64_t Client::ll_snap_seq(Inode *in)
12369 {
12370 Mutex::Locker lock(client_lock);
12371 return in->snaprealm->seq;
12372 }
12373
12374 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12375 {
12376 Mutex::Locker lock(client_lock);
12377 *layout = in->layout;
12378 return 0;
12379 }
12380
12381 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12382 {
12383 return ll_file_layout(fh->inode.get(), layout);
12384 }
12385
12386 /* Currently we cannot take advantage of redundancy in reads, since we
12387 would have to go through all possible placement groups (a
12388 potentially quite large number determined by a hash), and use CRUSH
12389 to calculate the appropriate set of OSDs for each placement group,
12390 then index into that. An array with one entry per OSD is much more
12391 tractable and works for demonstration purposes. */
12392
12393 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12394 file_layout_t* layout)
12395 {
12396 Mutex::Locker lock(client_lock);
12397
12398 inodeno_t ino = ll_get_inodeno(in);
12399 uint32_t object_size = layout->object_size;
12400 uint32_t su = layout->stripe_unit;
12401 uint32_t stripe_count = layout->stripe_count;
12402 uint64_t stripes_per_object = object_size / su;
12403
12404 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12405 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12406 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12407 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12408
12409 object_t oid = file_object_t(ino, objectno);
12410 return objecter->with_osdmap([&](const OSDMap& o) {
12411 ceph_object_layout olayout =
12412 o.file_to_object_layout(oid, *layout);
12413 pg_t pg = (pg_t)olayout.ol_pgid;
12414 vector<int> osds;
12415 int primary;
12416 o.pg_to_acting_osds(pg, &osds, &primary);
12417 return primary;
12418 });
12419 }
12420
12421 /* Return the offset of the block, internal to the object */
12422
12423 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12424 {
12425 Mutex::Locker lock(client_lock);
12426 file_layout_t *layout=&(in->layout);
12427 uint32_t object_size = layout->object_size;
12428 uint32_t su = layout->stripe_unit;
12429 uint64_t stripes_per_object = object_size / su;
12430
12431 return (blockno % stripes_per_object) * su;
12432 }
12433
12434 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12435 const UserPerm& perms)
12436 {
12437 Mutex::Locker lock(client_lock);
12438
12439 if (unmounting)
12440 return -ENOTCONN;
12441
12442 vinodeno_t vino = _get_vino(in);
12443
12444 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12445 tout(cct) << "ll_opendir" << std::endl;
12446 tout(cct) << vino.ino.val << std::endl;
12447
12448 if (!cct->_conf->fuse_default_permissions) {
12449 int r = may_open(in, flags, perms);
12450 if (r < 0)
12451 return r;
12452 }
12453
12454 int r = _opendir(in, dirpp, perms);
12455 tout(cct) << (unsigned long)*dirpp << std::endl;
12456
12457 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12458 << dendl;
12459 return r;
12460 }
12461
12462 int Client::ll_releasedir(dir_result_t *dirp)
12463 {
12464 Mutex::Locker lock(client_lock);
12465 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12466 tout(cct) << "ll_releasedir" << std::endl;
12467 tout(cct) << (unsigned long)dirp << std::endl;
12468
12469 if (unmounting)
12470 return -ENOTCONN;
12471
12472 _closedir(dirp);
12473 return 0;
12474 }
12475
12476 int Client::ll_fsyncdir(dir_result_t *dirp)
12477 {
12478 Mutex::Locker lock(client_lock);
12479 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12480 tout(cct) << "ll_fsyncdir" << std::endl;
12481 tout(cct) << (unsigned long)dirp << std::endl;
12482
12483 if (unmounting)
12484 return -ENOTCONN;
12485
12486 return _fsync(dirp->inode.get(), false);
12487 }
12488
// Low-level open of an existing inode.  O_CREAT is not supported here
// (creation goes through ll_create/_ll_create).  On success *fhp (if
// non-NULL) receives the new file handle.  Returns 0 or negative errno.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

  out:
  // remember handles handed to the ll interface (see ll_unclosed_fh_set)
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
12524
// Shared implementation behind ll_create/ll_createx: look up `name`
// under `parent`, honoring O_CREAT/O_EXCL, creating and/or opening the
// file as needed.  On success *in refs the inode and *fhp (always
// non-NULL here) may hold an open handle.  Returns 0 or negative errno.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing name fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also open the file and fill *fhp
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // existing file: check open permission, then open if _create didn't
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // remember handles handed to the ll interface (see ll_unclosed_fh_set)
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12606
12607 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12608 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12609 const UserPerm& perms)
12610 {
12611 Mutex::Locker lock(client_lock);
12612 InodeRef in;
12613
12614 if (unmounting)
12615 return -ENOTCONN;
12616
12617 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12618 fhp, perms);
12619 if (r >= 0) {
12620 assert(in);
12621
12622 // passing an Inode in outp requires an additional ref
12623 if (outp) {
12624 _ll_get(in.get());
12625 *outp = in.get();
12626 }
12627 fill_stat(in, attr);
12628 } else {
12629 attr->st_ino = 0;
12630 }
12631
12632 return r;
12633 }
12634
12635 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12636 int oflags, Inode **outp, Fh **fhp,
12637 struct ceph_statx *stx, unsigned want, unsigned lflags,
12638 const UserPerm& perms)
12639 {
12640 unsigned caps = statx_to_mask(lflags, want);
12641 Mutex::Locker lock(client_lock);
12642 InodeRef in;
12643
12644 if (unmounting)
12645 return -ENOTCONN;
12646
12647 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12648 if (r >= 0) {
12649 assert(in);
12650
12651 // passing an Inode in outp requires an additional ref
12652 if (outp) {
12653 _ll_get(in.get());
12654 *outp = in.get();
12655 }
12656 fill_statx(in, caps, stx);
12657 } else {
12658 stx->stx_ino = 0;
12659 stx->stx_mask = 0;
12660 }
12661
12662 return r;
12663 }
12664
12665 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12666 {
12667 Mutex::Locker lock(client_lock);
12668 tout(cct) << "ll_lseek" << std::endl;
12669 tout(cct) << offset << std::endl;
12670 tout(cct) << whence << std::endl;
12671
12672 if (unmounting)
12673 return -ENOTCONN;
12674
12675 return _lseek(fh, offset, whence);
12676 }
12677
12678 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12679 {
12680 Mutex::Locker lock(client_lock);
12681 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12682 tout(cct) << "ll_read" << std::endl;
12683 tout(cct) << (unsigned long)fh << std::endl;
12684 tout(cct) << off << std::endl;
12685 tout(cct) << len << std::endl;
12686
12687 if (unmounting)
12688 return -ENOTCONN;
12689
12690 return _read(fh, off, len, bl);
12691 }
12692
// Read `length` bytes at `offset` from RADOS object `blockid` of `in`
// directly via the objecter, bypassing the page/cap machinery.  `buf`
// must have room for `length` bytes (assumed, not checked — the actual
// copy is bounded by the bytes the OSD returned).  Returns bytes read
// or a negative errno.
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // drop client_lock while blocking on the OSD round-trip, then retake
  // it before returning (Locker above unlocks on destruction)
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // copy whatever the OSD returned into the caller's buffer
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12729
12730 /* It appears that the OSD doesn't return success unless the entire
12731 buffer was written, return the write length on success. */
12732
// Write `length` bytes from `buf` to RADOS object `blockid` of `in`
// directly via the objecter.  On success returns `length` (the OSD only
// reports success for a complete write); otherwise a negative errno.
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE: `true ||` deliberately forces the synchronous path; the
  // unstable/barrier branch below is currently disabled.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			       barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  // write against the caller-supplied snap sequence
  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;  // never handed to the objecter; avoid leaking it
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  // wait for the commit callback outside client_lock, on our private flock
  client_lock.Unlock();
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12805
12806 int Client::ll_commit_blocks(Inode *in,
12807 uint64_t offset,
12808 uint64_t length)
12809 {
12810 Mutex::Locker lock(client_lock);
12811 /*
12812 BarrierContext *bctx;
12813 vinodeno_t vino = _get_vino(in);
12814 uint64_t ino = vino.ino;
12815
12816 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12817 << offset << " to " << length << dendl;
12818
12819 if (length == 0) {
12820 return -EINVAL;
12821 }
12822
12823 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12824 if (p != barriers.end()) {
12825 barrier_interval civ(offset, offset + length);
12826 p->second->commit_barrier(civ);
12827 }
12828 */
12829 return 0;
12830 }
12831
12832 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12833 {
12834 Mutex::Locker lock(client_lock);
12835 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12836 "~" << len << dendl;
12837 tout(cct) << "ll_write" << std::endl;
12838 tout(cct) << (unsigned long)fh << std::endl;
12839 tout(cct) << off << std::endl;
12840 tout(cct) << len << std::endl;
12841
12842 if (unmounting)
12843 return -ENOTCONN;
12844
12845 int r = _write(fh, off, len, data, NULL, 0);
12846 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12847 << dendl;
12848 return r;
12849 }
12850
12851 int Client::ll_flush(Fh *fh)
12852 {
12853 Mutex::Locker lock(client_lock);
12854 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12855 tout(cct) << "ll_flush" << std::endl;
12856 tout(cct) << (unsigned long)fh << std::endl;
12857
12858 if (unmounting)
12859 return -ENOTCONN;
12860
12861 return _flush(fh);
12862 }
12863
12864 int Client::ll_fsync(Fh *fh, bool syncdataonly)
12865 {
12866 Mutex::Locker lock(client_lock);
12867 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12868 tout(cct) << "ll_fsync" << std::endl;
12869 tout(cct) << (unsigned long)fh << std::endl;
12870
12871 if (unmounting)
12872 return -ENOTCONN;
12873
12874 int r = _fsync(fh, syncdataonly);
12875 if (r) {
12876 // If we're returning an error, clear it from the FH
12877 fh->take_async_err();
12878 }
12879 return r;
12880 }
12881
#ifdef FALLOC_FL_PUNCH_HOLE

/*
 * fallocate(2)-style space manipulation on an open file.
 *
 * Supported modes: 0 (allocate/extend), FALLOC_FL_KEEP_SIZE, and
 * FALLOC_FL_PUNCH_HOLE (which, as on Linux, must be combined with
 * KEEP_SIZE).  Returns 0 on success or a negative errno.
 * Called with client_lock held; may drop it while waiting on OSD ops.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Reject mode bits we do not implement.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching a hole must not change the file size (Linux semantics).
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation needs new space; refuse on a full pool.  Punching a hole
  // frees space, so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // A plain allocation that grows the file counts against the byte quota.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // State for waiting on an uninline_data() completion, if one is started.
  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
	(have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: rewrite the inline blob locally,
      // zero-filling the punched range.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  in->inline_data.copy(0, offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len)
	  in->inline_data.copy(offset + size, len - offset - size, bl);
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
    } else {
      // Data lives (or must live) in RADOS: uninline first if needed, then
      // issue a zero op to the OSDs for the punched range.
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline = new C_SafeCond(&uninline_flock,
				    &uninline_cond,
				    &uninline_done,
				    &uninline_ret);
	uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, onfinish);
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      // Drop the client lock while waiting for the OSD zero to complete.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
	cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocation: just extend the (sparse) file size if needed.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = ceph_clock_now();
      in->change_attr++;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
	check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the uninline operation started above (lock dropped again).
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform without FALLOC_FL_PUNCH_HOLE (e.g. non-Linux): unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13026
13027
13028 int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13029 {
13030 Mutex::Locker lock(client_lock);
13031 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13032 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13033 tout(cct) << (unsigned long)fh << std::endl;
13034
13035 if (unmounting)
13036 return -ENOTCONN;
13037
13038 return _fallocate(fh, mode, offset, length);
13039 }
13040
13041 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13042 {
13043 Mutex::Locker lock(client_lock);
13044 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13045
13046 if (unmounting)
13047 return -ENOTCONN;
13048
13049 Fh *fh = get_filehandle(fd);
13050 if (!fh)
13051 return -EBADF;
13052 #if defined(__linux__) && defined(O_PATH)
13053 if (fh->flags & O_PATH)
13054 return -EBADF;
13055 #endif
13056 return _fallocate(fh, mode, offset, length);
13057 }
13058
13059 int Client::ll_release(Fh *fh)
13060 {
13061 Mutex::Locker lock(client_lock);
13062 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13063 dendl;
13064 tout(cct) << "ll_release (fh)" << std::endl;
13065 tout(cct) << (unsigned long)fh << std::endl;
13066
13067 if (unmounting)
13068 return -ENOTCONN;
13069
13070 if (ll_unclosed_fh_set.count(fh))
13071 ll_unclosed_fh_set.erase(fh);
13072 return _release_fh(fh);
13073 }
13074
13075 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13076 {
13077 Mutex::Locker lock(client_lock);
13078
13079 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13080 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13081
13082 if (unmounting)
13083 return -ENOTCONN;
13084
13085 return _getlk(fh, fl, owner);
13086 }
13087
13088 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13089 {
13090 Mutex::Locker lock(client_lock);
13091
13092 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13093 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13094
13095 if (unmounting)
13096 return -ENOTCONN;
13097
13098 return _setlk(fh, fl, owner, sleep);
13099 }
13100
13101 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13102 {
13103 Mutex::Locker lock(client_lock);
13104
13105 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13106 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13107
13108 if (unmounting)
13109 return -ENOTCONN;
13110
13111 return _flock(fh, cmd, owner);
13112 }
13113
13114 int Client::set_deleg_timeout(uint32_t timeout)
13115 {
13116 Mutex::Locker lock(client_lock);
13117
13118 /*
13119 * The whole point is to prevent blacklisting so we must time out the
13120 * delegation before the session autoclose timeout kicks in.
13121 */
13122 if (timeout >= mdsmap->get_session_autoclose())
13123 return -EINVAL;
13124
13125 deleg_timeout = timeout;
13126 return 0;
13127 }
13128
13129 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13130 {
13131 int ret = -EINVAL;
13132
13133 Mutex::Locker lock(client_lock);
13134
13135 if (!mounted)
13136 return -ENOTCONN;
13137
13138 Inode *inode = fh->inode.get();
13139
13140 switch(cmd) {
13141 case CEPH_DELEGATION_NONE:
13142 inode->unset_deleg(fh);
13143 ret = 0;
13144 break;
13145 default:
13146 try {
13147 ret = inode->set_deleg(fh, cmd, cb, priv);
13148 } catch (std::bad_alloc) {
13149 ret = -ENOMEM;
13150 }
13151 break;
13152 }
13153 return ret;
13154 }
13155
// Completion context queued on the interrupt finisher: re-takes the client
// lock and cancels an in-flight SETFILELOCK request.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();  // hold a reference until finish() runs
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    // Only filelock requests are interruptible this way.
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the constructor
  }
};
13171
// Request cancellation of an in-flight filelock request.  Called without
// client_lock held; the work is handed to the interrupt finisher thread,
// whose context takes the lock itself.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13179
13180 // =========================================
13181 // layout
13182
13183 // expose file layouts
13184
13185 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13186 const UserPerm& perms)
13187 {
13188 Mutex::Locker lock(client_lock);
13189
13190 if (unmounting)
13191 return -ENOTCONN;
13192
13193 filepath path(relpath);
13194 InodeRef in;
13195 int r = path_walk(path, &in, perms);
13196 if (r < 0)
13197 return r;
13198
13199 *lp = in->layout;
13200
13201 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13202 return 0;
13203 }
13204
13205 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13206 {
13207 Mutex::Locker lock(client_lock);
13208
13209 if (unmounting)
13210 return -ENOTCONN;
13211
13212 Fh *f = get_filehandle(fd);
13213 if (!f)
13214 return -EBADF;
13215 Inode *in = f->inode.get();
13216
13217 *lp = in->layout;
13218
13219 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13220 return 0;
13221 }
13222
13223 int64_t Client::get_default_pool_id()
13224 {
13225 Mutex::Locker lock(client_lock);
13226
13227 if (unmounting)
13228 return -ENOTCONN;
13229
13230 /* first data pool is the default */
13231 return mdsmap->get_first_data_pool();
13232 }
13233
13234 // expose osdmap
13235
13236 int64_t Client::get_pool_id(const char *pool_name)
13237 {
13238 Mutex::Locker lock(client_lock);
13239
13240 if (unmounting)
13241 return -ENOTCONN;
13242
13243 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13244 pool_name);
13245 }
13246
13247 string Client::get_pool_name(int64_t pool)
13248 {
13249 Mutex::Locker lock(client_lock);
13250
13251 if (unmounting)
13252 return string();
13253
13254 return objecter->with_osdmap([pool](const OSDMap& o) {
13255 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13256 });
13257 }
13258
13259 int Client::get_pool_replication(int64_t pool)
13260 {
13261 Mutex::Locker lock(client_lock);
13262
13263 if (unmounting)
13264 return -ENOTCONN;
13265
13266 return objecter->with_osdmap([pool](const OSDMap& o) {
13267 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13268 });
13269 }
13270
// For file offset 'off' of fd, return the acting OSDs of the object holding
// that byte and (optionally, via *len) the remaining length of its stripe
// unit.  Returns 0, -ENOTCONN, -EBADF, or -EINVAL when no OSDs are acting.
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map exactly one byte so exactly one extent comes back (see below).
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13316
13317 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13318 {
13319 Mutex::Locker lock(client_lock);
13320
13321 if (unmounting)
13322 return -ENOTCONN;
13323
13324 if (id < 0)
13325 return -EINVAL;
13326 return objecter->with_osdmap([&](const OSDMap& o) {
13327 return o.crush->get_full_location_ordered(id, path);
13328 });
13329 }
13330
// Append the addresses of the acting OSDs for the object that backs the
// given file offset of fd.  Returns 0, -ENOTCONN, -EBADF, or -EINVAL when
// no OSDs are acting for the placement group.
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?  (length 1 guarantees a single extent)
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addr(osds[i]);
	address.push_back(addr);
      }
      return 0;
    });
}
13364
13365 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13366 {
13367 Mutex::Locker lock(client_lock);
13368
13369 if (unmounting)
13370 return -ENOTCONN;
13371
13372 return objecter->with_osdmap([&](const OSDMap& o) {
13373 if (!o.exists(osd))
13374 return -ENOENT;
13375
13376 addr = o.get_addr(osd);
13377 return 0;
13378 });
13379 }
13380
13381 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13382 loff_t length, loff_t offset)
13383 {
13384 Mutex::Locker lock(client_lock);
13385
13386 if (unmounting)
13387 return -ENOTCONN;
13388
13389 Fh *f = get_filehandle(fd);
13390 if (!f)
13391 return -EBADF;
13392 Inode *in = f->inode.get();
13393
13394 // map to a list of extents
13395 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13396
13397 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13398 return 0;
13399 }
13400
13401
13402 /* find an osd with the same ip. -ENXIO if none. */
int Client::get_local_osd()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  // Cache the answer per osdmap epoch; only recompute when the map changed.
  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddr());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
13418
13419
13420
13421
13422
13423
13424 // ===============================
13425
// Messenger callback: a connection we initiated is now established.
// Nothing to do beyond logging.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13430
// Messenger callback: our side of a connection was reset.  We return false
// (no special handling here).
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13436
// Messenger callback: the peer reset an established connection.  For MDS
// peers, advance the matching session's state machine accordingly.
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We asked to close anyway; treat the reset as the close.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Preserve any open-waiters across the close/reopen cycle.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    // Either force a reconnect by closing the session, or simply
	    // mark it stale, depending on configuration.
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13497
// Messenger callback: the peer actively refused our connection.  We return
// false (no special handling here).
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13503
13504 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13505 {
13506 if (dest_type == CEPH_ENTITY_TYPE_MON)
13507 return true;
13508 *authorizer = monclient->build_authorizer(dest_type);
13509 return true;
13510 }
13511
// Walk up from 'in' to the nearest ancestor (excluding 'in' itself) with a
// quota enabled, or to root_ancestor.  Parents are found through cached
// dentries when a valid lease or matching shared-cap generation allows it;
// otherwise a LOOKUPNAME request is sent to the MDS.
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Stop at the first quota-enabled ancestor above the starting inode.
    if (cur != in && cur->quota.is_enable())
      break;

    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // Valid dentry lease from a live session: trust the cached parent.
	  parent_in = dn->dir->parent_inode;
	} else {
	  // No lease: accept the cached parent only while the directory's
	  // shared cap generation still matches the dentry's.
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // Cache miss: ask the MDS for this inode's parent.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // make_request may have dropped the lock; refresh the lease clock and
    // restart the walk (from the looked-up parent if we were at the start).
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13583
13584 /**
13585 * Traverse quota ancestors of the Inode, return true
13586 * if any of them passes the passed function
13587 */
13588 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13589 std::function<bool (const Inode &in)> test)
13590 {
13591 while (true) {
13592 assert(in != NULL);
13593 if (test(*in)) {
13594 return true;
13595 }
13596
13597 if (in == root_ancestor) {
13598 // We're done traversing, drop out
13599 return false;
13600 } else {
13601 // Continue up the tree
13602 in = get_quota_root(in, perms);
13603 }
13604 }
13605
13606 return false;
13607 }
13608
13609 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13610 {
13611 return check_quota_condition(in, perms,
13612 [](const Inode &in) {
13613 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13614 });
13615 }
13616
13617 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13618 const UserPerm& perms)
13619 {
13620 return check_quota_condition(in, perms,
13621 [&new_bytes](const Inode &in) {
13622 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13623 > in.quota.max_bytes;
13624 });
13625 }
13626
13627 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13628 {
13629 return check_quota_condition(in, perms,
13630 [](const Inode &in) {
13631 if (in.quota.max_bytes) {
13632 if (in.rstat.rbytes >= in.quota.max_bytes) {
13633 return true;
13634 }
13635
13636 assert(in.size >= in.reported_size);
13637 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13638 const uint64_t size = in.size - in.reported_size;
13639 return (space >> 4) < size;
13640 } else {
13641 return false;
13642 }
13643 });
13644 }
13645
// Flag bits cached in pool_perms (see check_pool_perm()).
enum {
  POOL_CHECKED = 1,   // a permission check has completed for this pool
  POOL_CHECKING = 2,  // a check is in flight; other callers wait on it
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
13652
// Verify (and cache per pool/namespace) that our OSD caps permit the
// read/write access implied by 'need' on the inode's data pool.  Probes the
// pool with a stat and an exclusive create on the file's first object;
// drops client_lock while waiting.  Returns 0, -EPERM, or -EIO when the
// probe result is indeterminate.
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the check in flight so concurrent callers wait on us (above).
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read access with a stat on the first object...
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // ...and write access with an exclusive create (-EEXIST still counts).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Wait for both probes without holding the client lock.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13755
13756 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13757 {
13758 if (acl_type == POSIX_ACL) {
13759 if (in->xattrs.count(ACL_EA_ACCESS)) {
13760 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13761
13762 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13763 }
13764 }
13765 return -EAGAIN;
13766 }
13767
// Keep the inode's access ACL consistent with a new mode (chmod path).
// Returns 0 on success (including "no ACL to update") or a negative errno.
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Ensure xattrs are up to date before inspecting the ACL.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a copy; posix_acl_access_chmod updates it in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      // No access ACL present: nothing to rewrite.
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13793
// Compute the effective mode and initial ACL xattrs for a new inode created
// under 'dir'.  If the directory has a default ACL it is inherited
// (directories also receive the default ACL itself); otherwise the umask
// callback, if set, is applied to *mode.  On success r is the number of
// xattrs encoded into xattrs_bl (0 if none); negative errno on failure.
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  // Ensure the directory's xattrs are current before reading its ACLs.
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Work on a copy; posix_acl_inherit_mode updates acl and *mode.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// Inherited ACL present: keep it as the access ACL unless it is
	// fully representable by the mode bits (per posix_acl_equiv_mode).
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // New directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via the callback instead.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
13841
13842 void Client::set_filer_flags(int flags)
13843 {
13844 Mutex::Locker l(client_lock);
13845 assert(flags == 0 ||
13846 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13847 objecter->add_global_op_flags(flags);
13848 }
13849
13850 void Client::clear_filer_flags(int flags)
13851 {
13852 Mutex::Locker l(client_lock);
13853 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13854 objecter->clear_global_op_flag(flags);
13855 }
13856
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e OSD map epoch to record as the cap release barrier.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
13869
13870 const char** Client::get_tracked_conf_keys() const
13871 {
13872 static const char* keys[] = {
13873 "client_cache_size",
13874 "client_cache_mid",
13875 "client_acl_type",
13876 "client_deleg_timeout",
13877 "client_deleg_break_on_open",
13878 NULL
13879 };
13880 return keys;
13881 }
13882
13883 void Client::handle_conf_change(const struct md_config_t *conf,
13884 const std::set <std::string> &changed)
13885 {
13886 Mutex::Locker lock(client_lock);
13887
13888 if (changed.count("client_cache_mid")) {
13889 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13890 }
13891 if (changed.count("client_acl_type")) {
13892 acl_type = NO_ACL;
13893 if (cct->_conf->client_acl_type == "posix_acl")
13894 acl_type = POSIX_ACL;
13895 }
13896 }
13897
13898 void Client::init_groups(UserPerm *perms)
13899 {
13900 gid_t *sgids;
13901 int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13902 perms->init_gids(sgids, count);
13903 }
13904
// intrusive_ptr support: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
13909
// intrusive_ptr support: drop a reference via the owning client, which may
// free the inode when the count reaches zero.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
13914
13915 mds_rank_t Client::_get_random_up_mds() const
13916 {
13917 assert(client_lock.is_locked_by_me());
13918
13919 std::set<mds_rank_t> up;
13920 mdsmap->get_up_mds_set(up);
13921
13922 if (up.empty())
13923 return MDS_RANK_NONE;
13924 std::set<mds_rank_t>::const_iterator p = up.begin();
13925 for (int n = rand() % up.size(); n; n--)
13926 ++p;
13927 return *p;
13928 }
13929
13930
// A standalone client owns its own Objecter (freed in the destructor),
// unlike embedded users that share one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
13937
StandaloneClient::~StandaloneClient()
{
  // The objecter was allocated by our constructor; release it here.
  delete objecter;
  objecter = nullptr;
}
13943
// Bring up a standalone client: timer, object cacher, objecter, messenger
// dispatchers, and the monitor session.  Returns 0 or the monclient init
// error, in which case the partially-started pieces are torn down again.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
13974
// Shut down in reverse dependency order: client state first, then the
// objecter it relied on, then the monitor session.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}