]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Client.cc
update source to 12.2.11
[ceph.git] / ceph / src / client / Client.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16 // unix-ey fs stuff
17 #include <unistd.h>
18 #include <sys/types.h>
19 #include <time.h>
20 #include <utime.h>
21 #include <sys/stat.h>
22 #include <sys/param.h>
23 #include <fcntl.h>
24 #include <sys/file.h>
25 #include <sys/utsname.h>
26 #include <sys/uio.h>
27
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
30
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
34 #else
35 #include <sys/xattr.h>
36 #endif
37
38 #if defined(__linux__)
39 #include <linux/falloc.h>
40 #endif
41
42 #include <sys/statvfs.h>
43
44 #include "common/config.h"
45 #include "common/version.h"
46
47 // ceph stuff
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
63
64 #include "mon/MonClient.h"
65
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
69
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
76
77 #define dout_subsys ceph_subsys_client
78
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
82
83 #include "Client.h"
84 #include "Inode.h"
85 #include "Dentry.h"
86 #include "Delegation.h"
87 #include "Dir.h"
88 #include "ClientSnapRealm.h"
89 #include "Fh.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
94
95 #include "include/assert.h"
96 #include "include/stat.h"
97
98 #include "include/cephfs/ceph_statx.h"
99
100 #if HAVE_GETGROUPLIST
101 #include <grp.h>
102 #include <pwd.h>
103 #include <unistd.h>
104 #endif
105
106 #undef dout_prefix
107 #define dout_prefix *_dout << "client." << whoami << " "
108
109 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
110
111 // FreeBSD fails to define this
112 #ifndef O_DSYNC
113 #define O_DSYNC 0x0
114 #endif
115 // Darwin fails to define this
116 #ifndef O_RSYNC
117 #define O_RSYNC 0x0
118 #endif
119
120 #ifndef O_DIRECT
121 #define O_DIRECT 0x0
122 #endif
123
124 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
125
126 void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
127 {
128 Client *client = static_cast<Client*>(p);
129 client->flush_set_callback(oset);
130 }
131
132
133 // -------------
134
135 Client::CommandHook::CommandHook(Client *client) :
136 m_client(client)
137 {
138 }
139
140 bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
141 std::string format, bufferlist& out)
142 {
143 Formatter *f = Formatter::create(format);
144 f->open_object_section("result");
145 m_client->client_lock.Lock();
146 if (command == "mds_requests")
147 m_client->dump_mds_requests(f);
148 else if (command == "mds_sessions")
149 m_client->dump_mds_sessions(f);
150 else if (command == "dump_cache")
151 m_client->dump_cache(f);
152 else if (command == "kick_stale_sessions")
153 m_client->_kick_stale_sessions();
154 else if (command == "status")
155 m_client->dump_status(f);
156 else
157 assert(0 == "bad command registered");
158 m_client->client_lock.Unlock();
159 f->close_section();
160 f->flush(out);
161 delete f;
162 return true;
163 }
164
165
166 // -------------
167
// Fresh per-open directory read state.  Offsets 0 and 1 are reserved for
// "." and "..", so the first real entry is handed out at next_offset 2;
// the release/ordered counters and shared-gen snapshot start at zero and
// are captured later when a readdir actually begins.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
173
174 void Client::_reset_faked_inos()
175 {
176 ino_t start = 1024;
177 free_faked_inos.clear();
178 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
179 last_used_faked_ino = 0;
180 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
181 }
182
// Allocate the next free faked inode number for @in, scanning the free
// interval set circularly starting just past the last number handed out.
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Ran off the top of the number space: wrap around and rescan.
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // The pool is assumed non-empty (32-bit space, reclaimed on release).
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // The next free interval lies above the cursor; take its first value.
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Cursor is inside this free interval; take the next value in it.
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  // Record the allocation both ways: out of the free pool, into the map.
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
202
203 void Client::_release_faked_ino(Inode *in)
204 {
205 free_faked_inos.insert(in->faked_ino);
206 faked_ino_map.erase(in->faked_ino);
207 }
208
209 vinodeno_t Client::_map_faked_ino(ino_t ino)
210 {
211 vinodeno_t vino;
212 if (ino == 1)
213 vino = root->vino();
214 else if (faked_ino_map.count(ino))
215 vino = faked_ino_map[ino];
216 else
217 vino = vinodeno_t(0, CEPH_NOSNAP);
218 ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
219 return vino;
220 }
221
222 vinodeno_t Client::map_faked_ino(ino_t ino)
223 {
224 Mutex::Locker lock(client_lock);
225 return _map_faked_ino(ino);
226 }
227
228 // cons/des
229
// Construct the client around an existing Messenger/MonClient/Objecter.
// NOTE: the member-initializer list must stay in member declaration
// order; several members (timer, finishers) capture m->cct before the
// Dispatcher base has fully published cct.
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    remount_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    umask_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    tick_event(NULL),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    initialized(false),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock"),
    deleg_timeout(0)
{
  // Start with an empty faked-ino allocator and no root inode; the root
  // is populated on mount by add_update_inode().
  _reset_faked_inos();
  //
  root = 0;

  num_flushing_caps = 0;

  // Precompute virtual-xattr name sizes so listxattr can size buffers
  // without walking the tables every call.
  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  acl_type = NO_ACL;
  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are reserved (stdio etc. in some callers)
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: the object cacher shares client_lock with us, so its
  // callbacks run under the same lock discipline as the rest of Client.
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback, // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
297
298
299 Client::~Client()
300 {
301 assert(!client_lock.is_locked());
302
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
306 client_lock.Lock();
307 tear_down_cache();
308 client_lock.Unlock();
309 }
310
311 void Client::tear_down_cache()
312 {
313 // fd's
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
315 it != fd_map.end();
316 ++it) {
317 Fh *fh = it->second;
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
319 _release_fh(fh);
320 }
321 fd_map.clear();
322
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
326 _closedir(dirp);
327 }
328
329 // caps!
330 // *** FIXME ***
331
332 // empty lru
333 trim_cache();
334 assert(lru.lru_get_size() == 0);
335
336 // close root ino
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
339 delete root;
340 root = 0;
341 root_ancestor = 0;
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
344 inode_map.clear();
345 _reset_faked_inos();
346 }
347
348 assert(inode_map.empty());
349 }
350
351 inodeno_t Client::get_root_ino()
352 {
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
356 else
357 return root->ino;
358 }
359
360 Inode *Client::get_root()
361 {
362 Mutex::Locker l(client_lock);
363 root->ll_get();
364 return root;
365 }
366
367
368 // debug crapola
369
370 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
371 {
372 filepath path;
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
377 << " " << path
378 << " ref " << in->get_num_ref()
379 << *in << dendl;
380
381 if (f) {
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
384 if (disconnected)
385 f->dump_int("disconnected", 1);
386 in->dump(f);
387 f->close_section();
388 }
389
390 did.insert(in);
391 if (in->dir) {
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
395 ++it) {
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
397 if (f) {
398 f->open_object_section("dentry");
399 it->second->dump(f);
400 f->close_section();
401 }
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
404 }
405 }
406 }
407
408 void Client::dump_cache(Formatter *f)
409 {
410 set<Inode*> did;
411
412 ldout(cct, 1) << "dump_cache" << dendl;
413
414 if (f)
415 f->open_array_section("cache");
416
417 if (root)
418 dump_inode(f, root, did, true);
419
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
423 ++it) {
424 if (did.count(it->second))
425 continue;
426 dump_inode(f, it->second, did, true);
427 }
428
429 if (f)
430 f->close_section();
431 }
432
// Dump a one-shot status summary (identity, cache sizes, map epochs) to
// @f.  Caller must hold client_lock; the osdmap epoch is read through
// the objecter's own locking.
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
    f->dump_object("inst", inst);
    // inst is emitted three ways for different consumers: structured,
    // stringified whole, and address only.
    f->dump_stream("inst_str") << inst;
    f->dump_stream("addr_str") << inst.addr;
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
462
// First-stage initialization: start the timer and object cacher, then
// register as a dispatcher.  Always returns 0; the remaining setup
// (perf counters, admin socket commands) happens in _finish_init().
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  // From here on, incoming messages may be dispatched to this object.
  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
477
478 void Client::_finish_init()
479 {
480 client_lock.Lock();
481 // logger
482 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
483 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
484 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
485 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
486 logger.reset(plb.create_perf_counters());
487 cct->get_perfcounters_collection()->add(logger.get());
488
489 client_lock.Unlock();
490
491 cct->_conf->add_observer(this);
492
493 AdminSocket* admin_socket = cct->get_admin_socket();
494 int ret = admin_socket->register_command("mds_requests",
495 "mds_requests",
496 &m_command_hook,
497 "show in-progress mds requests");
498 if (ret < 0) {
499 lderr(cct) << "error registering admin socket command: "
500 << cpp_strerror(-ret) << dendl;
501 }
502 ret = admin_socket->register_command("mds_sessions",
503 "mds_sessions",
504 &m_command_hook,
505 "show mds session state");
506 if (ret < 0) {
507 lderr(cct) << "error registering admin socket command: "
508 << cpp_strerror(-ret) << dendl;
509 }
510 ret = admin_socket->register_command("dump_cache",
511 "dump_cache",
512 &m_command_hook,
513 "show in-memory metadata cache contents");
514 if (ret < 0) {
515 lderr(cct) << "error registering admin socket command: "
516 << cpp_strerror(-ret) << dendl;
517 }
518 ret = admin_socket->register_command("kick_stale_sessions",
519 "kick_stale_sessions",
520 &m_command_hook,
521 "kick sessions that were remote reset");
522 if (ret < 0) {
523 lderr(cct) << "error registering admin socket command: "
524 << cpp_strerror(-ret) << dendl;
525 }
526 ret = admin_socket->register_command("status",
527 "status",
528 &m_command_hook,
529 "show overall client status");
530 if (ret < 0) {
531 lderr(cct) << "error registering admin socket command: "
532 << cpp_strerror(-ret) << dendl;
533 }
534
535 client_lock.Lock();
536 initialized = true;
537 client_lock.Unlock();
538 }
539
// Orderly teardown, the reverse of init()/_finish_init().  The ordering
// matters: sessions close first, admin hooks unregister before their
// targets die, each finisher drains before stopping, and the object
// cacher is stopped OUTSIDE client_lock because it joins its thread.
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  // Unregister admin-socket commands before tearing down the state they
  // would dump.
  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  // Drain and stop each callback finisher that was started (a non-null
  // callback pointer is the marker that its finisher is running).
  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  // Remove the perf counters last; logging above may still touch them.
  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
599
600
601 // ===================
602 // metadata cache stuff
603
604 void Client::trim_cache(bool trim_kernel_dcache)
605 {
606 uint64_t max = cct->_conf->client_cache_size;
607 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
608 unsigned last = 0;
609 while (lru.lru_get_size() != last) {
610 last = lru.lru_get_size();
611
612 if (!unmounting && lru.lru_get_size() <= max) break;
613
614 // trim!
615 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
616 if (!dn)
617 break; // done
618
619 trim_dentry(dn);
620 }
621
622 if (trim_kernel_dcache && lru.lru_get_size() > max)
623 _invalidate_kernel_dcache();
624
625 // hose root?
626 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
627 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
628 delete root;
629 root = 0;
630 root_ancestor = 0;
631 while (!root_parents.empty())
632 root_parents.erase(root_parents.begin());
633 inode_map.clear();
634 _reset_faked_inos();
635 }
636 }
637
638 void Client::trim_cache_for_reconnect(MetaSession *s)
639 {
640 mds_rank_t mds = s->mds_num;
641 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
642
643 int trimmed = 0;
644 list<Dentry*> skipped;
645 while (lru.lru_get_size() > 0) {
646 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
647 if (!dn)
648 break;
649
650 if ((dn->inode && dn->inode->caps.count(mds)) ||
651 dn->dir->parent_inode->caps.count(mds)) {
652 trim_dentry(dn);
653 trimmed++;
654 } else
655 skipped.push_back(dn);
656 }
657
658 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
659 lru.lru_insert_mid(*p);
660
661 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
662 << " trimmed " << trimmed << " dentries" << dendl;
663
664 if (s->caps.size() > 0)
665 _invalidate_kernel_dcache();
666 }
667
668 void Client::trim_dentry(Dentry *dn)
669 {
670 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
671 << " in dir " << hex << dn->dir->parent_inode->ino
672 << dendl;
673 if (dn->inode) {
674 Inode *diri = dn->dir->parent_inode;
675 diri->dir_release_count++;
676 clear_dir_complete_and_ordered(diri, true);
677 }
678 unlink(dn, false, false); // drop dir, drop dentry
679 }
680
681
// Apply an MDS-reported size/truncation update to @in.  truncate_seq
// ordering decides whether the MDS value wins over what we have locally;
// @issued is the cap set currently held (unused here but kept for parity
// with update_inode_file_time's signature).
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  // Take the MDS size if it comes from a newer truncation epoch, or the
  // same epoch with a larger size (writes extend within an epoch).
  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // A truncate_size change can arrive with an equal (not just newer)
  // truncate_seq; only files carry a meaningful truncate_size.
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
723
// Merge MDS-reported ctime/mtime/atime into @in.  Which side wins
// depends on the caps we hold (exclusive caps mean our local times may
// be newer than the MDS's) and on time_warp_seq ordering.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    // Holding write-ish caps: our local times are authoritative-ish, so
    // merge conservatively rather than overwrite.
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    // No relevant caps: the MDS values win outright, as long as they are
    // not from an older time-warp epoch.
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // The MDS sent an older time_warp_seq than we hold without us having
    // FILE_EXCL to justify it — flag the inconsistency loudly.
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
775
776 void Client::_fragmap_remove_non_leaves(Inode *in)
777 {
778 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
779 if (!in->dirfragtree.is_leaf(p->first))
780 in->fragmap.erase(p++);
781 else
782 ++p;
783 }
784
785 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
786 {
787 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
788 if (p->second == mds)
789 in->fragmap.erase(p++);
790 else
791 ++p;
792 }
793
// Create or update the cached inode for an MDS-supplied InodeStat.
// Field-by-field, a value from @st is applied only when it cannot be
// stale relative to local state: either the stat is strictly newer
// (version/auth check) or it grants caps we did not already hold, and
// we do not hold an EXCL cap that makes our local copy authoritative.
// Finally installs/updates the cap the MDS granted along with the stat.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // First inode ever seen becomes the root.
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // Pre-mount traces walk upward; track the chain of root parents.
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // Caps we effectively hold now: issued plus locally dirtied ones.
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  // AUTH fields (mode/owner): skip if we hold AUTH_EXCL (ours is newer).
  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
		   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
		   request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // Snapshot inodes have no live cap machinery; just accumulate caps.
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
945
946
947 /*
948 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
949 */
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.  An existing dentry with the wrong target is unlinked
 * first; @old_dentry (a rename source) is unlinked from its old dir.
 * Any directory whose ordered listing is disturbed gets its
 * ordered-count bumped and completeness flags cleared.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // Already linked to the right inode: just refresh its LRU position.
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // tmp_ref keeps @in alive across the unlink below, which could
    // otherwise drop its last reference.
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
995
996 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
997 {
998 utime_t dttl = from;
999 dttl += (float)dlease->duration_ms / 1000.0;
1000
1001 assert(dn);
1002
1003 if (dlease->mask & CEPH_LOCK_DN) {
1004 if (dttl > dn->lease_ttl) {
1005 ldout(cct, 10) << "got dentry lease on " << dn->name
1006 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1007 dn->lease_ttl = dttl;
1008 dn->lease_mds = session->mds_num;
1009 dn->lease_seq = dlease->seq;
1010 dn->lease_gen = session->cap_gen;
1011 }
1012 }
1013 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1014 }
1015
1016
1017 /*
1018 * update MDS location cache for a single inode
1019 */
/*
 * update MDS location cache for a single inode: record (or forget) which
 * mds is authoritative for the reported dirfrag, force the fragtree to
 * treat that frag as a leaf, and note whether the frag is replicated.
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    // Negative auth means "no known auth mds"; drop any stale entry.
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1051
1052 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1053 {
1054 if (diri->flags & I_COMPLETE) {
1055 if (complete) {
1056 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1057 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1058 } else {
1059 if (diri->flags & I_DIR_ORDERED) {
1060 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1061 diri->flags &= ~I_DIR_ORDERED;
1062 }
1063 }
1064 if (diri->dir)
1065 diri->dir->readdir_cache.clear();
1066 }
1067 }
1068
1069 /*
1070 * insert results from readdir or lssnap into the metadata cache.
1071 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    // offset 2 is the first real dentry slot; an empty last_name means we
    // are at the start of the fragment
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    // the MDS may answer from a different fragment than the one we asked
    // for (e.g. after a split/merge); adopt the fragment it replied with
    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    // if we are starting a listing from the very beginning of the
    // directory, snapshot the release/ordered/shared-gen counters; they
    // let us detect below whether the cached listing stayed valid
    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        // each distinct hash value starts its own offset sequence at 2
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache (only while the counters we snapshotted at
      // the start of the listing still match, i.e. nothing changed the
      // directory underneath us)
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1224
1225 /** insert_trace
1226 *
1227 * insert a trace from a MDS reply into the cache.
1228 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    // the unsafe (early) reply already updated the cache; the safe reply
    // carries no trace and there is nothing more to apply
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: we cannot update the cache from the reply, so
    // invalidate what we know about the affected directory/dentries
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  // a dentry trace carries: parent dir inode stat, dir stat, name, lease
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug option: verify the MDS actually returned the xattrs we
      // asked for via the getattr/open mask
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // negative dentry: drop any stale inode link, then (if the MDS
      // granted a lease) keep a null dentry to cache the negative result
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1394
1395 // -------
1396
// Pick the MDS rank a request should be sent to.  Preference order:
// explicit resend target, dirfrag hash placement, held caps, random.
// On a hash-based choice, *phash_diri is set to the directory inode whose
// fragmap was consulted (so the caller can repair it if the mds is gone).
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  // a forward/resend target overrides everything else (one-shot)
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      // negative dentry: hash the name within its parent directory
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      // snapped inodes have no caps of their own; walk up to the nearest
      // non-snap ancestor and target its mds instead
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    // if we know which mds owns the fragment this name hashes into,
    // send there directly
    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    // otherwise prefer the mds we hold caps from (auth cap if the op
    // must go to the authoritative mds)
    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1500
1501
1502 void Client::connect_mds_targets(mds_rank_t mds)
1503 {
1504 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1505 assert(mds_sessions.count(mds));
1506 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1507 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1508 q != info.export_targets.end();
1509 ++q) {
1510 if (mds_sessions.count(*q) == 0 &&
1511 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1512 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1513 << " export target mds." << *q << dendl;
1514 _open_mds_session(*q);
1515 }
1516 }
1517 }
1518
1519 void Client::dump_mds_sessions(Formatter *f)
1520 {
1521 f->dump_int("id", get_nodeid().v);
1522 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
1523 f->dump_object("inst", inst);
1524 f->dump_stream("inst_str") << inst;
1525 f->dump_stream("addr_str") << inst.addr;
1526 f->open_array_section("sessions");
1527 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1528 f->open_object_section("session");
1529 p->second->dump(f);
1530 f->close_section();
1531 }
1532 f->close_section();
1533 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1534 }
1535 void Client::dump_mds_requests(Formatter *f)
1536 {
1537 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1538 p != mds_requests.end();
1539 ++p) {
1540 f->open_object_section("request");
1541 p->second->dump(f);
1542 f->close_section();
1543 }
1544 }
1545
// After a reply has been processed, make sure the caller gets a usable
// target inode even when the MDS sent a traceless reply: fall back to a
// lookup/getattr by name and detect a lost create race (-EINTR).
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the trace already pinned the target for us
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen? i want logs!");
        }
      } else {
        // no dentry on the request; re-stat the inode it operated on
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
1618
1619
1620 /**
1621 * make a request
1622 *
1623 * Blocking helper to make an MDS request.
1624 *
1625 * If the ptarget flag is set, behavior changes slightly: the caller
1626 * expects to get a pointer to the inode we are creating or operating
1627 * on. As a result, we will follow up any traceless mutation reply
1628 * with a getattr or lookup to transparently handle a traceless reply
1629 * from the MDS (as when the MDS restarts and the client has to replay
1630 * a request).
1631 *
1632 * @param request the MetaRequest to execute
1633 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1634 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1635 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1636 * @param use_mds [optional] prefer a specific mds (-1 for default)
1637 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1638 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // setfilelock requests are excluded from the oldest-tid accounting
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  // retry loop: choose an mds, ensure a session, send, and wait; loop
  // again on forward/kick until we get a reply or the request aborts
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        // the chosen rank no longer exists; fix up how we chose it
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // we only get here via abort; clean up our registration/reference
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request); // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // if the caller wants the target inode, resolve it (handles the
  // traceless-reply case via lookup/getattr)
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
1786
1787 void Client::unregister_request(MetaRequest *req)
1788 {
1789 mds_requests.erase(req->tid);
1790 if (req->tid == oldest_tid) {
1791 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1792 while (true) {
1793 if (p == mds_requests.end()) {
1794 oldest_tid = 0;
1795 break;
1796 }
1797 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1798 oldest_tid = p->first;
1799 break;
1800 }
1801 ++p;
1802 }
1803 }
1804 put_request(req);
1805 }
1806
1807 void Client::put_request(MetaRequest *request)
1808 {
1809 if (request->_put()) {
1810 int op = -1;
1811 if (request->success)
1812 op = request->get_op();
1813 InodeRef other_in;
1814 request->take_other_inode(&other_in);
1815 delete request;
1816
1817 if (other_in &&
1818 (op == CEPH_MDS_OP_RMDIR ||
1819 op == CEPH_MDS_OP_RENAME ||
1820 op == CEPH_MDS_OP_RMSNAP)) {
1821 _try_to_trim_inode(other_in.get(), false);
1822 }
1823 }
1824 }
1825
1826 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1827 mds_rank_t mds, int drop,
1828 int unless, int force)
1829 {
1830 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1831 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1832 << ", have:" << ", force:" << force << ")" << dendl;
1833 int released = 0;
1834 if (in->caps.count(mds)) {
1835 Cap *caps = in->caps[mds];
1836 drop &= ~(in->dirty_caps | get_caps_used(in));
1837 if ((drop & caps->issued) &&
1838 !(unless & caps->issued)) {
1839 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1840 caps->issued &= ~drop;
1841 caps->implemented &= ~drop;
1842 released = 1;
1843 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1844 } else {
1845 released = force;
1846 }
1847 if (released) {
1848 ceph_mds_request_release rel;
1849 rel.ino = in->ino;
1850 rel.cap_id = caps->cap_id;
1851 rel.seq = caps->seq;
1852 rel.issue_seq = caps->issue_seq;
1853 rel.mseq = caps->mseq;
1854 rel.caps = caps->implemented;
1855 rel.wanted = caps->wanted;
1856 rel.dname_len = 0;
1857 rel.dname_seq = 0;
1858 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1859 }
1860 }
1861 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1862 << released << dendl;
1863 return released;
1864 }
1865
// Append a release for the lease on 'dn' (and caps on its parent dir) to
// the request, so the MDS need not revoke them itself.
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  // releasing a dentry goes through its parent directory's inode;
  // force=1 so a release record is appended even if no caps were dropped
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  // if we also hold a lease on this dentry from the same mds, attach the
  // dentry name and lease seq to the record we just appended
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
                 << dn << ")" << dendl;
}
1885
1886
1887 /*
1888 * This requires the MClientRequest *request member to be set.
1889 * It will error out horribly without one.
1890 * Additionally, if you set any *drop member, you'd better have
1891 * set the corresponding dentry!
1892 */
1893 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1894 {
1895 ldout(cct, 20) << "encode_cap_releases enter (req: "
1896 << req << ", mds: " << mds << ")" << dendl;
1897 if (req->inode_drop && req->inode())
1898 encode_inode_release(req->inode(), req,
1899 mds, req->inode_drop,
1900 req->inode_unless);
1901
1902 if (req->old_inode_drop && req->old_inode())
1903 encode_inode_release(req->old_inode(), req,
1904 mds, req->old_inode_drop,
1905 req->old_inode_unless);
1906 if (req->other_inode_drop && req->other_inode())
1907 encode_inode_release(req->other_inode(), req,
1908 mds, req->other_inode_drop,
1909 req->other_inode_unless);
1910
1911 if (req->dentry_drop && req->dentry())
1912 encode_dentry_release(req->dentry(), req,
1913 mds, req->dentry_drop,
1914 req->dentry_unless);
1915
1916 if (req->old_dentry_drop && req->old_dentry())
1917 encode_dentry_release(req->old_dentry(), req,
1918 mds, req->old_dentry_drop,
1919 req->old_dentry_unless);
1920 ldout(cct, 25) << "encode_cap_releases exit (req: "
1921 << req << ", mds " << mds <<dendl;
1922 }
1923
1924 bool Client::have_open_session(mds_rank_t mds)
1925 {
1926 return
1927 mds_sessions.count(mds) &&
1928 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1929 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1930 }
1931
1932 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1933 {
1934 if (mds_sessions.count(mds) == 0)
1935 return NULL;
1936 MetaSession *s = mds_sessions[mds];
1937 if (s->con != con)
1938 return NULL;
1939 return s;
1940 }
1941
1942 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1943 {
1944 if (mds_sessions.count(mds))
1945 return mds_sessions[mds];
1946 return _open_mds_session(mds);
1947 }
1948
1949 /**
1950 * Populate a map of strings with client-identifying metadata,
1951 * such as the hostname. Call this once at initialization.
1952 */
1953 void Client::populate_metadata(const std::string &mount_root)
1954 {
1955 // Hostname
1956 struct utsname u;
1957 int r = uname(&u);
1958 if (r >= 0) {
1959 metadata["hostname"] = u.nodename;
1960 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1961 } else {
1962 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1963 }
1964
1965 metadata["pid"] = stringify(getpid());
1966
1967 // Ceph entity id (the '0' in "client.0")
1968 metadata["entity_id"] = cct->_conf->name.get_id();
1969
1970 // Our mount position
1971 if (!mount_root.empty()) {
1972 metadata["root"] = mount_root;
1973 }
1974
1975 // Ceph version
1976 metadata["ceph_version"] = pretty_version_to_str();
1977 metadata["ceph_sha1"] = git_version_to_str();
1978
1979 // Apply any metadata from the user's configured overrides
1980 std::vector<std::string> tokens;
1981 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1982 for (const auto &i : tokens) {
1983 auto eqpos = i.find("=");
1984 // Throw out anything that isn't of the form "<str>=<str>"
1985 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1986 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1987 continue;
1988 }
1989 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
1990 }
1991 }
1992
1993 /**
1994 * Optionally add or override client metadata fields.
1995 */
1996 void Client::update_metadata(std::string const &k, std::string const &v)
1997 {
1998 Mutex::Locker l(client_lock);
1999 assert(initialized);
2000
2001 if (metadata.count(k)) {
2002 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
2003 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
2004 }
2005
2006 metadata[k] = v;
2007 }
2008
// Create and register a new session to 'mds' and (unless this daemon
// previously rejected us) send the session-open request.  The session is
// returned in STATE_OPENING; callers wait on waiting_for_open.
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  // register before sending so replies can find the session
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      // a different instance now holds this rank; the old rejection no
      // longer applies
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
2041
2042 void Client::_close_mds_session(MetaSession *s)
2043 {
2044 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2045 s->state = MetaSession::STATE_CLOSING;
2046 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2047 }
2048
// Tear down a session that is now closed: drop its connection, wake any
// waiters, drop its caps, requeue its in-flight requests, and free it.
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // wake anyone waiting for this session to open and any unmount waiter
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  // requests that were pending on this session must be resent elsewhere
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
2060
// Dispatch an incoming MClientSession message from an MDS.  Consumes the
// message (m->put()) on every path.
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  // only accept the message if it matches a session on this connection
  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    // session established: renew caps, mark open, and wake waiters
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // only extend the cap TTL if this ack matches our latest renew request
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    // force the TTL into the past so the caps are treated as expired
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    // MDS asks us to shrink our cap footprint to the given maximum
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    // remember which instance rejected us so we don't keep retrying it
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);

    break;

  default:
    ceph_abort();
  }

  m->put();
}
2128
2129 bool Client::_any_stale_sessions() const
2130 {
2131 assert(client_lock.is_locked_by_me());
2132
2133 for (const auto &i : mds_sessions) {
2134 if (i.second->state == MetaSession::STATE_STALE) {
2135 return true;
2136 }
2137 }
2138
2139 return false;
2140 }
2141
2142 void Client::_kick_stale_sessions()
2143 {
2144 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2145
2146 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2147 p != mds_sessions.end(); ) {
2148 MetaSession *s = p->second;
2149 ++p;
2150 if (s->state == MetaSession::STATE_STALE)
2151 _closed_mds_session(s);
2152 }
2153 }
2154
// (Re)build and transmit a MetaRequest to the given session's MDS.
// drop_cap_releases is set when resending before the cap reconnect has
// gone out, in which case any embedded cap releases are discarded.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // we already received an unsafe reply, so this is a replay of an op the
    // MDS has applied; tell it so and identify the created target inode
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may reference a data pool; stamp the osdmap epoch we saw
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  // stamp only on the first send; retries keep the original send time
  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // record the cap migration seq so ESTALE handling can later tell whether
  // the caps moved after this request was sent
  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
2199
// Translate our internal MetaRequest into a wire-format MClientRequest.
// If the request has no filepath yet, one is derived from its inode or
// dentry.  Also bumps the request's retry_attempt counter.
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// null dentry: path is parent dir's path plus the dentry name
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2237
2238
2239
// Handle a forward notice from an MDS: the request was (or should be)
// redirected to another rank.  We redirect the pending MetaRequest to the
// destination mds and wake the caller thread, which performs the resend.
// Consumes (puts) the message.
void Client::handle_client_request_forward(MClientRequestForward *fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    fwd->put();
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
    fwd->put();
    return;
  }

  MetaRequest *request = mds_requests[tid];
  assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  // NOTE(review): this log line says "handle_client_request" and streams
  // get_dest_mds() twice; presumably the first stream was meant to be the
  // forwarding mds -- confirm before changing, it is log-only.
  ldout(cct, 10) << "handle_client_request tid " << tid
		 << " fwd " << fwd->get_num_fwd() 
		 << " to mds." << fwd->get_dest_mds() 
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;
  
  // detach from the old session and let the caller thread resend to the
  // destination mds
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();

  fwd->put();
}
2278
2279 bool Client::is_dir_operation(MetaRequest *req)
2280 {
2281 int op = req->get_op();
2282 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2283 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2284 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2285 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2286 return true;
2287 return false;
2288 }
2289
// Handle an MDS reply (unsafe or safe) for a pending MetaRequest.
// Attaches the reply to the request, wakes the caller thread, and on the
// safe reply cleans the request up.  Consumes (puts) the message unless it
// is attached to the request.
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
	       << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
		 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
		  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
		   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    // Only give up (return ESTALE to the caller) if retrying would hit the
    // same mds and the caps have not migrated since we sent the request;
    // otherwise redirect and let the caller resend.
    if (request->resend_mds >= 0 &&
	request->resend_mds == request->mds &&
	(in == NULL ||
	 in->caps.count(request->resend_mds) == 0 ||
	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }
  
  assert(request->reply == NULL);
  request->reply = reply;       // request now owns the reply message
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;
    
    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back: the caller clears dispatch_cond and signals it
    // once it has consumed the reply, so we can't race ahead of it
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
2392
// React to an OSD "full" condition for one pool (pool >= 0) or for the
// whole cluster (pool == -1): cancel in-flight writes with -ENOSPC and
// purge matching dirty cache so the writes are not re-issued.
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
    << "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
        << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);  // surface the error on next fsync/close
    }
  }

  // Raise the cap epoch barrier so the MDS knows which ops we cancelled.
  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
2431
// Process a new OSDMap: detect (un)blacklisting of this client and react
// to cluster-wide or per-pool FULL flags.  Consumes (puts) the message.
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    // We were just blacklisted: fail everything fast so unmount can proceed.
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
        });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    // Abort every pending MDS request and wake its caller.
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
        return o.is_blacklisted(myaddr);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Cluster-wide full flag: cancel writes in every pool (-1 = all).
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
2515
2516
2517 // ------------------------
2518 // incoming messages
2519
2520
// Central message dispatcher (called by the messenger).  Takes client_lock,
// routes each message type to its handler, and returns false for types we
// do not consume.  Handlers take ownership of (put) the message.
bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    // only MDS command replies are ours; let other dispatchers see the rest
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  // While unmounting, use each dispatched message as an opportunity to trim
  // the cache, and poke unmount() whenever that made progress.
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() 
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() 
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2598
2599 void Client::handle_fs_map(MFSMap *m)
2600 {
2601 fsmap.reset(new FSMap(m->get_fsmap()));
2602 m->put();
2603
2604 signal_cond_list(waiting_for_fsmap);
2605
2606 monclient->sub_got("fsmap", fsmap->get_epoch());
2607 }
2608
2609 void Client::handle_fs_map_user(MFSMapUser *m)
2610 {
2611 fsmap_user.reset(new FSMapUser);
2612 *fsmap_user = m->get_fsmap();
2613 m->put();
2614
2615 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2616 signal_cond_list(waiting_for_fsmap);
2617 }
2618
2619 void Client::handle_mds_map(MMDSMap* m)
2620 {
2621 mds_gid_t old_inc, new_inc;
2622 if (m->get_epoch() <= mdsmap->get_epoch()) {
2623 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2624 << " is identical to or older than our "
2625 << mdsmap->get_epoch() << dendl;
2626 m->put();
2627 return;
2628 }
2629
2630 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2631
2632 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2633 oldmap.swap(mdsmap);
2634
2635 mdsmap->decode(m->get_encoded());
2636
2637 // Cancel any commands for missing or laggy GIDs
2638 std::list<ceph_tid_t> cancel_ops;
2639 auto &commands = command_table.get_commands();
2640 for (const auto &i : commands) {
2641 auto &op = i.second;
2642 const mds_gid_t op_mds_gid = op.mds_gid;
2643 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2644 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2645 cancel_ops.push_back(i.first);
2646 if (op.outs) {
2647 std::ostringstream ss;
2648 ss << "MDS " << op_mds_gid << " went away";
2649 *(op.outs) = ss.str();
2650 }
2651 op.con->mark_down();
2652 if (op.on_finish) {
2653 op.on_finish->complete(-ETIMEDOUT);
2654 }
2655 }
2656 }
2657
2658 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2659 i != cancel_ops.end(); ++i) {
2660 command_table.erase(*i);
2661 }
2662
2663 // reset session
2664 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2665 p != mds_sessions.end(); ) {
2666 mds_rank_t mds = p->first;
2667 MetaSession *session = p->second;
2668 ++p;
2669
2670 int oldstate = oldmap->get_state(mds);
2671 int newstate = mdsmap->get_state(mds);
2672 if (!mdsmap->is_up(mds)) {
2673 session->con->mark_down();
2674 } else if (mdsmap->get_inst(mds) != session->inst) {
2675 old_inc = oldmap->get_incarnation(mds);
2676 new_inc = mdsmap->get_incarnation(mds);
2677 if (old_inc != new_inc) {
2678 ldout(cct, 1) << "mds incarnation changed from "
2679 << old_inc << " to " << new_inc << dendl;
2680 oldstate = MDSMap::STATE_NULL;
2681 }
2682 session->con->mark_down();
2683 session->inst = mdsmap->get_inst(mds);
2684 // When new MDS starts to take over, notify kernel to trim unused entries
2685 // in its dcache/icache. Hopefully, the kernel will release some unused
2686 // inodes before the new MDS enters reconnect state.
2687 trim_cache_for_reconnect(session);
2688 } else if (oldstate == newstate)
2689 continue; // no change
2690
2691 session->mds_state = newstate;
2692 if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
2693 // missed reconnect close the session so that it can be reopened
2694 _closed_mds_session(session);
2695 continue;
2696 }
2697 if (newstate == MDSMap::STATE_RECONNECT) {
2698 session->con = messenger->get_connection(session->inst);
2699 send_reconnect(session);
2700 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2701 if (oldstate < MDSMap::STATE_ACTIVE) {
2702 // kick new requests
2703 kick_requests(session);
2704 kick_flushing_caps(session);
2705 signal_context_list(session->waiting_for_open);
2706 kick_maxsize_requests(session);
2707 wake_inode_waiters(session);
2708 }
2709 connect_mds_targets(mds);
2710 } else if (newstate == MDSMap::STATE_NULL &&
2711 mds >= mdsmap->get_max_mds()) {
2712 _closed_mds_session(session);
2713 }
2714 }
2715
2716 // kick any waiting threads
2717 signal_cond_list(waiting_for_mdsmap);
2718
2719 m->put();
2720
2721 monclient->sub_got("mdsmap", mdsmap->get_epoch());
2722 }
2723
// Send the cap reconnect message to an MDS that entered STATE_RECONNECT:
// reset per-session sequence state, resend unsafe/old requests, and report
// every cap (and snaprealm) we hold from that rank so the MDS can rebuild
// its session state.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // drop any batched cap releases; they refer to pre-reconnect cap state
  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
	       << " " << ccap_string(in->caps[mds]->issued)
	       << " wants " << ccap_string(in->caps_wanted())
	       << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      // Reset cap sequence numbers; the new MDS incarnation starts fresh.
      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino, 
		 cap->cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap->issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // report each snaprealm at most once
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
2799
2800
2801 void Client::kick_requests(MetaSession *session)
2802 {
2803 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2804 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2805 p != mds_requests.end();
2806 ++p) {
2807 MetaRequest *req = p->second;
2808 if (req->got_unsafe)
2809 continue;
2810 if (req->aborted()) {
2811 if (req->caller_cond) {
2812 req->kick = true;
2813 req->caller_cond->Signal();
2814 }
2815 continue;
2816 }
2817 if (req->retry_attempt > 0)
2818 continue; // new requests only
2819 if (req->mds == session->mds_num) {
2820 send_request(p->second, session);
2821 }
2822 }
2823 }
2824
// Resend requests to an MDS entering reconnect: first every request that
// already got an unsafe reply (so the MDS can replay it), then every old
// (already-retried) request so the MDS can process completed requests in
// its clientreplay stage.
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);  // drop cap releases: pre-reconnect state
  }
}
2848
// Block until every currently outstanding unsafe request has been committed
// (become safe).  It is enough to wait on the *last* unsafe request of each
// session, since the MDS replies in order on a session.
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (!s->unsafe_requests.empty()) {
      MetaRequest *req = s->unsafe_requests.back();
      req->get();  // hold a ref: the request may complete while we wait
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    // still unsafe? wait for the safe reply to signal waitfor_safe
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
2872
// A session is closing for good: wake the caller of every request targeted
// at that mds so it can react, and unregister any request that had already
// received an unsafe reply (it can never become safe now).
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;  // advance first: unregister_request() below removes the entry
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
	lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
	req->unsafe_item.remove_myself();
	req->unsafe_dir_item.remove_myself();
	req->unsafe_target_item.remove_myself();
	signal_cond_list(req->waitfor_safe);
	unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
2899
2900
2901
2902
2903 /************
2904 * leases
2905 */
2906
2907 void Client::got_mds_push(MetaSession *s)
2908 {
2909 s->seq++;
2910 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2911 if (s->state == MetaSession::STATE_CLOSING) {
2912 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2913 }
2914 }
2915
// Handle a dentry-lease revocation from the MDS (the only lease action we
// expect, per the assert).  Invalidate the lease locally if we hold it,
// then always acknowledge with a LEASE_RELEASE.  Consumes (puts) the message.
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    // nothing to invalidate locally; still must ack below
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark lease as no longer held from any mds
  }

 revoke:
  // always ack the revoke, whether or not we held anything
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
2958
// Drop n references on an inode; on the last reference, release its caps,
// verify the object cache is clean, remove it from inode_map (and the
// root pointers if applicable), and delete it.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);  // a dying inode must have no dirty/tx buffers left
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      // dropping the root: clear all root bookkeeping
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
2984
2985 void Client::close_dir(Dir *dir)
2986 {
2987 Inode *in = dir->parent_inode;
2988 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2989 assert(dir->is_empty());
2990 assert(in->dir == dir);
2991 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2992 if (!in->dn_set.empty())
2993 in->get_first_parent()->put(); // unpin dentry
2994
2995 delete in->dir;
2996 in->dir = 0;
2997 put_inode(in); // unpin inode
2998 }
2999
/**
 * Link a dentry (named 'name' in 'dir') to inode 'in'.
 *
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Returns the dentry (newly created if dn was NULL).
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;
    
    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?
    if (!in)
      dir->num_null_dentries++;

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // reusing a pre-created (null) dentry
    assert(!dn->inode);
    if (in)
      dir->num_null_dentries--;
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
		   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      // extra dentry pins while the dir object / ll refs exist
      if (in->dir)
	dn->get(); // dir -> dn pin
      if (in->ll_ref)
	dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl; 
  }
  
  return dn;
}
3057
3058 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3059 {
3060 InodeRef in;
3061 in.swap(dn->inode);
3062 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3063 << " inode " << dn->inode << dendl;
3064
3065 // unlink from inode
3066 if (in) {
3067 if (in->is_dir()) {
3068 if (in->dir)
3069 dn->put(); // dir -> dn pin
3070 if (in->ll_ref)
3071 dn->put(); // ll_ref -> dn pin
3072 }
3073 dn->inode = 0;
3074 assert(in->dn_set.count(dn));
3075 in->dn_set.erase(dn);
3076 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3077 }
3078
3079 if (keepdentry) {
3080 dn->lease_mds = -1;
3081 if (in)
3082 dn->dir->num_null_dentries++;
3083 } else {
3084 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3085
3086 // unlink from dir
3087 dn->dir->dentries.erase(dn->name);
3088 if (!in)
3089 dn->dir->num_null_dentries--;
3090 if (dn->dir->is_empty() && !keepdir)
3091 close_dir(dn->dir);
3092 dn->dir = 0;
3093
3094 // delete den
3095 lru.lru_remove(dn);
3096 dn->put();
3097 }
3098 }
3099
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Completion callback attached to async flush ops; holds an InodeRef so
 * the inode stays alive until the flush finishes.  Runs under client_lock.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;  // keeps the inode pinned for the lifetime of the flush
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      // stash the error so it can be reported on a later fsync/close
      inode->set_async_err(r);
    }
  }
};
3121
3122
3123 /****
3124 * caps
3125 */
3126
3127 void Client::get_cap_ref(Inode *in, int cap)
3128 {
3129 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3130 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3131 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3132 in->get();
3133 }
3134 if ((cap & CEPH_CAP_FILE_CACHE) &&
3135 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3136 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3137 in->get();
3138 }
3139 in->get_cap_ref(cap);
3140 }
3141
// Drop one reference on each cap bit in 'cap'.  Inode::put_cap_ref()
// returns the subset of bits whose count just reached zero; only those
// need follow-up work (finishing cap snaps, waking waiters, reporting
// droppable caps to the MDS, releasing the inode pins taken in
// get_cap_ref()).
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // bits we no longer use and that the MDS hasn't (re)issued: candidates
    // for release via check_caps() below
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      // last writer gone: a pending cap_snap can now capture a stable state
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        // all buffered writes are out; cap snaps no longer hold dirty data
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;  // matches the in->get() taken in get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // matches the in->get() taken in get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3175
// Block until we hold the caps in 'need' (plus whatever of 'want' is
// available without waiting out a revocation).  On success *phave is the
// acquired set and a cap reference is taken on 'need'; the caller must
// balance it with put_cap_ref().  Returns 0, or a negative error:
// -EBADF if the open file modes no longer want 'need', -EROFS for a write
// cap on a readonly session, or an error from check_pool_perm()/
// _renew_caps().  May drop and retake client_lock while waiting.
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      // writer path: make sure the MDS has granted room up to 'endoff'
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);  // asks the auth MDS for a larger max_size
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      // can't start new writes while a cap_snap is being captured or
      // still holds dirty buffered data
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // kick off a flush; C_Client_FlushComplete records any IO error
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        // only take the 'want' bits that are not mid-revocation
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // our caps were dropped (e.g. session reset) and the MDS may no
      // longer know what we want; re-request before sleeping
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3270
3271 int Client::get_caps_used(Inode *in)
3272 {
3273 unsigned used = in->caps_used();
3274 if (!(used & CEPH_CAP_FILE_CACHE) &&
3275 !objectcacher->set_is_empty(&in->oset))
3276 used |= CEPH_CAP_FILE_CACHE;
3277 return used;
3278 }
3279
3280 void Client::cap_delay_requeue(Inode *in)
3281 {
3282 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3283 in->hold_caps_until = ceph_clock_now();
3284 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3285 delayed_list.push_back(&in->delay_cap_item);
3286 }
3287
// Send one cap update/flush message for 'cap' to its MDS session,
// trimming our local issued/implemented sets down to 'retain'.
// 'flush' is the set of dirty cap bits being flushed, identified by
// 'flush_tid'; 'sync' requests a synchronous ack from the MDS.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;              // never retain what is being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << (sync ? " sync " : " async ")
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  // Test hook: pretend we failed to release caps (see #9800).
  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: shrink to what we keep plus what is still in use.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  // attribute the flush to whoever dirtied the caps
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  // the MDS sizes future max_size grants off what we last reported
  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
3397
3398 static bool is_max_size_approaching(Inode *in)
3399 {
3400 /* mds will adjust max size according to the reported size */
3401 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3402 return false;
3403 if (in->size >= in->max_size)
3404 return true;
3405 /* half of previous max_size increment has been used */
3406 if (in->max_size > in->reported_size &&
3407 (in->size << 1) >= in->max_size + in->reported_size)
3408 return true;
3409 return false;
3410 }
3411
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * For every cap on the inode, either skip it (MDS already knows what we
 * want, nothing to drop, nothing dirty, or a delay is in effect) or fall
 * through to 'ack' and send a cap message via send_cap().
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY /
 *              CHECK_CAPS_SYNCHRONOUS)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  // what we'd like to keep holding locally
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  // a CACHE/LAZYIO revocation can be satisfied immediately if nothing is
  // buffered: drop the object cache and stop counting CACHE as used
  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  // iterator is advanced before the body runs; NOTE(review): looks like a
  // guard against the current entry being dropped while handling it —
  // confirm which callee can erase a cap here
  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;

    MetaSession *session = mds_sessions[mds];
    assert(session);

    // for a non-auth cap, usage covered by the auth cap doesn't block
    // this MDS's revocations
    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap->issued)
                   << " implemented " << ccap_string(cap->implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
3560
3561
// Capture the inode's state for the just-closed snap context 'old_snapc'
// as a CapSnap, to be flushed to the MDS later.  If a write is in flight
// (FILE_WR in use) the snapshot is marked 'writing' and completed from
// put_cap_ref(); otherwise it is finished immediately.
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // the most recent cap_snap is still waiting on its writer; don't stack
    // another one
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||     // NOTE(review): re-queries caps_dirty();
                                     // 'dirty' above already holds this value
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // keyed by the old snap context's seq; must not already exist
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      // size/mtime not stable yet; finish_cap_snap() runs when the last
      // writer drops its FILE_WR ref (see put_cap_ref)
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
3601
// Record the now-stable file metadata into 'capsnap' and flush it, unless
// buffered data (FILE_BUFFER) is still outstanding — in that case the
// flush is deferred until the buffers drain (see put_cap_ref).
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  // pick up anything dirtied since the snapshot was queued
  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers remain; flush_snaps() will be retried once they drain
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
3627
3628 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3629 {
3630 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3631 in->cap_snaps.at(seq).dirty_data = 0;
3632 flush_snaps(in);
3633 }
3634
// Send FLUSHSNAP messages to the auth MDS for every flushable cap_snap on
// the inode.  Snaps that still have dirty buffered data or an active
// writer are skipped.  With all_again=true, snaps that were already sent
// (flush_tid > 0) are re-sent — used when reflushing after reconnect.
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
        continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // not stable yet: data unflushed or writer still active
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      // first send: register the flush with the session so completion
      // tracking (oldest_flush_tid) covers it
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
        session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
                                     cap_epoch_barrier);
    if (user_id >= 0)
      m->caller_uid = user_id;
    if (group_id >= 0)
      m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
3711
3712
3713
3714 void Client::wait_on_list(list<Cond*>& ls)
3715 {
3716 Cond cond;
3717 ls.push_back(&cond);
3718 cond.Wait(client_lock);
3719 ls.remove(&cond);
3720 }
3721
3722 void Client::signal_cond_list(list<Cond*>& ls)
3723 {
3724 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3725 (*it)->Signal();
3726 }
3727
3728 void Client::wait_on_context_list(list<Context*>& ls)
3729 {
3730 Cond cond;
3731 bool done = false;
3732 int r;
3733 ls.push_back(new C_Cond(&cond, &done, &r));
3734 while (!done)
3735 cond.Wait(client_lock);
3736 }
3737
3738 void Client::signal_context_list(list<Context*>& ls)
3739 {
3740 while (!ls.empty()) {
3741 ls.front()->complete(0);
3742 ls.pop_front();
3743 }
3744 }
3745
3746 void Client::wake_inode_waiters(MetaSession *s)
3747 {
3748 xlist<Cap*>::iterator iter = s->caps.begin();
3749 while (!iter.end()){
3750 signal_cond_list((*iter)->inode->waitfor_caps);
3751 ++iter;
3752 }
3753 }
3754
3755
3756 // flush dirty data (from objectcache)
3757
// Finisher-queued callback that invokes the kernel cache-invalidation
// callback for a byte range of one inode.  The vinodeno is captured at
// construction time (faked ino if enabled) so no inode pointer is held.
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
3777
3778 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3779 {
3780 if (unmounting)
3781 return;
3782 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3783 ino_invalidate_cb(callback_handle, ino, off, len);
3784 }
3785
3786 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3787
3788 if (ino_invalidate_cb)
3789 // we queue the invalidate, which calls the callback and decrements the ref
3790 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3791 }
3792
3793 void Client::_invalidate_inode_cache(Inode *in)
3794 {
3795 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3796
3797 // invalidate our userspace inode cache
3798 if (cct->_conf->client_oc) {
3799 objectcacher->release_set(&in->oset);
3800 if (!objectcacher->set_is_empty(&in->oset))
3801 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3802 }
3803
3804 _schedule_invalidate_callback(in, 0, 0);
3805 }
3806
3807 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3808 {
3809 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3810
3811 // invalidate our userspace inode cache
3812 if (cct->_conf->client_oc) {
3813 vector<ObjectExtent> ls;
3814 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3815 objectcacher->discard_writeback(&in->oset, ls, nullptr);
3816 }
3817
3818 _schedule_invalidate_callback(in, off, len);
3819 }
3820
3821 bool Client::_release(Inode *in)
3822 {
3823 ldout(cct, 20) << "_release " << *in << dendl;
3824 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3825 _invalidate_inode_cache(in);
3826 return true;
3827 }
3828 return false;
3829 }
3830
3831 bool Client::_flush(Inode *in, Context *onfinish)
3832 {
3833 ldout(cct, 10) << "_flush " << *in << dendl;
3834
3835 if (!in->oset.dirty_or_tx) {
3836 ldout(cct, 10) << " nothing to flush" << dendl;
3837 onfinish->complete(0);
3838 return true;
3839 }
3840
3841 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3842 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3843 objectcacher->purge_set(&in->oset);
3844 if (onfinish) {
3845 onfinish->complete(-ENOSPC);
3846 }
3847 return true;
3848 }
3849
3850 return objectcacher->flush_set(&in->oset, onfinish);
3851 }
3852
// Synchronously flush dirty/tx buffers overlapping [offset, offset+size)
// from the object cache.  Drops and retakes client_lock while waiting, so
// inode state may change across this call.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  // private lock/cond pair so the flush completion can fire while
  // client_lock is released below
  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  // file_flush returns true if the range was already clean/flushed
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
3877
3878 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3879 {
3880 // Mutex::Locker l(client_lock);
3881 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3882 Inode *in = static_cast<Inode *>(oset->parent);
3883 assert(in);
3884 _flushed(in);
3885 }
3886
3887 void Client::_flushed(Inode *in)
3888 {
3889 ldout(cct, 10) << "_flushed " << *in << dendl;
3890
3891 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3892 }
3893
3894
3895
3896 // checks common to add_update_cap, handle_cap_grant
3897 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3898 {
3899 unsigned had = in->caps_issued();
3900
3901 if ((issued & CEPH_CAP_FILE_CACHE) &&
3902 !(had & CEPH_CAP_FILE_CACHE))
3903 in->cache_gen++;
3904
3905 if ((issued & CEPH_CAP_FILE_SHARED) &&
3906 !(had & CEPH_CAP_FILE_SHARED)) {
3907 in->shared_gen++;
3908
3909 if (in->is_dir())
3910 clear_dir_complete_and_ordered(in, true);
3911 }
3912 }
3913
// Install or refresh the cap granted by mds_session for 'in'.  Creates the
// Cap (and opens the snap realm on the first cap) if this MDS had none,
// handles the export/import race where a stale grant arrives after an
// auth-change, moves auth-ness when appropriate, and wakes cap waiters if
// new bits were issued.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // first cap from this MDS
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      // very first cap on the inode: attach it to its snap realm
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt this cap as auth only if it is newer (by mseq) than the
    // current auth cap
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued = issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits may unblock get_caps() waiters
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
3999
// Tear down one cap: optionally queue a cap-release message to the MDS,
// detach the inode from per-session flushing state if this was the auth
// cap, erase and free the Cap, and close the snap realm when the last cap
// on the inode goes away.  Invalidates iterators into in->caps and
// session->caps pointing at this cap.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    // batched into the session's next cap-release message
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;  // defensive: local copy only, cap is gone

  if (!in->is_any_caps()) {
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
4038
4039 void Client::remove_all_caps(Inode *in)
4040 {
4041 while (!in->caps.empty())
4042 remove_cap(in->caps.begin()->second, true);
4043 }
4044
// Forcibly drop every cap held through session 's' (e.g. on session
// close/eviction) WITHOUT sending releases, discarding any pending cap
// snaps and dirty/flushing cap state, and waking anyone blocked on caps.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      // losing the auth cap: remember what state we are about to throw
      // away, and mark the inode so get_caps() re-requests from the MDS
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;  // bitwise-or into bool: true if either set
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      // tmp_ref keeps the inode alive while its cap_snaps (which may hold
      // the last references) are destroyed
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      // drop the inode reference held on behalf of the flush
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();  // wake flush waiters: these tids will never complete
}
4080
// Ask the mounter (via remount_cb) to remount the filesystem so the
// kernel trims its dentries.  On success the retry counter resets; on
// failure the error is logged and, if the relevant "die on failure"
// options are set and the retry budget (with retry_on_error) is spent,
// the client aborts.  Returns the callback's return code.
int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = cct->_conf->get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;
  } else {
    int e = errno;  // capture before any call can clobber it
    // NOTE(review): 'whoami' appears unused but presumably feeds the
    // lderr prefix macro, as with C_Client_FlushComplete — confirm
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort =
      (cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
4112
// Finisher-queued callback that triggers a remount (with retries) to make
// the kernel trim its dentry cache; queued by _invalidate_kernel_dcache().
class C_Client_Remount : public Context {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);
    client->_do_remount(true);
  }
};
4123
4124 void Client::_invalidate_kernel_dcache()
4125 {
4126 if (unmounting)
4127 return;
4128 if (can_invalidate_dentries) {
4129 if (dentry_invalidate_cb && root->dir) {
4130 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4131 p != root->dir->dentries.end();
4132 ++p) {
4133 if (p->second->inode)
4134 _schedule_invalidate_dentry_callback(p->second, false);
4135 }
4136 }
4137 } else if (remount_cb) {
4138 // Hacky:
4139 // when remounting a file system, linux kernel trims all unused dentries in the fs
4140 remount_finisher.queue(new C_Client_Remount(this));
4141 }
4142 }
4143
// If every dentry under 'in' is negative (no inode), unlink the
// expireable ones and close the directory once empty; recurse into the
// snapdir if one is open.  Only fires when ALL children are null, so a
// directory with any real entries is left untouched.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink(): it erases dn from dir->dentries
      ++p;
      assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4168
// Try to reduce the number of caps held on session 's' to at most 'max'
// (requested by the MDS when it is under cache pressure).  Disposable
// non-auth caps are removed outright; otherwise we try to trim the inode's
// dentries so the cap can be released later.  If we still hold too many
// caps afterwards, fall back to invalidating the whole kernel dcache.
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
		 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    // hold a ref so the inode can't be freed while we work on it
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap: nothing in use that the auth cap can't cover
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      // auth (or only) cap: try to make the inode trimmable by expiring
      // its dentries instead
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
	Dentry *dn = *q++;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << " not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      // count the inode as trimmed only if every dentry was expireable
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  // actually trim the queued dentries now that cap traversal is done
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
4235
4236 void Client::force_session_readonly(MetaSession *s)
4237 {
4238 s->readonly = true;
4239 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4240 Inode *in = (*p)->inode;
4241 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4242 signal_cond_list(in->waitfor_caps);
4243 }
4244 }
4245
// Transition in's dirty caps into "flushing" state under a fresh flush tid.
// Records the tid on both the inode and the auth session, links the inode
// onto the session's flushing list, and returns the cap bits being flushed
// (the tid is stored through *ptid).  The caller sends the actual flush.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  // num_flushing_caps counts inodes, so bump it only on the 0 -> nonzero edge
  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();
 
  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4273
4274 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4275 {
4276 for (auto &p : in->cap_snaps) {
4277 CapSnap &capsnap = p.second;
4278 if (capsnap.flush_tid > 0) {
4279 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4280 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4281 }
4282 }
4283 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4284 it != in->flushing_cap_tids.end();
4285 ++it) {
4286 old_s->flushing_caps_tids.erase(it->first);
4287 new_s->flushing_caps_tids.insert(it->first);
4288 }
4289 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4290 }
4291
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  // first drain the delayed list; each inode is popped before check_caps so
  // the list shrinks as we go
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    // only the very last check_caps overall (no more delayed entries and no
    // dirty entries to follow) is made synchronous
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
4324
4325 void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4326 {
4327 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4328 Cap *cap = in->auth_cap;
4329 assert(cap->session == session);
4330
4331 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4332 p != in->flushing_cap_tids.end();
4333 ++p) {
4334 bool req_sync = false;
4335
4336 /* If this is a synchronous request, then flush the journal on last one */
4337 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4338 req_sync = true;
4339
4340 send_cap(in, session, cap, req_sync,
4341 (get_caps_used(in) | in->caps_dirty()),
4342 in->caps_wanted(), (cap->issued | cap->implemented),
4343 p->second, p->first);
4344 }
4345 }
4346
4347 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4348 {
4349 while (in->flushing_caps) {
4350 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4351 assert(it != in->flushing_cap_tids.end());
4352 if (it->first > want)
4353 break;
4354 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4355 << ccap_string(it->second) << " want " << want
4356 << " last " << it->first << dendl;
4357 wait_on_list(in->waitfor_caps);
4358 }
4359 }
4360
4361 void Client::wait_sync_caps(ceph_tid_t want)
4362 {
4363 retry:
4364 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4365 << num_flushing_caps << " total flushing)" << dendl;
4366 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4367 p != mds_sessions.end();
4368 ++p) {
4369 MetaSession *s = p->second;
4370 if (s->flushing_caps_tids.empty())
4371 continue;
4372 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4373 if (oldest_tid <= want) {
4374 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4375 << " (want " << want << ")" << dendl;
4376 sync_cond.Wait(client_lock);
4377 goto retry;
4378 }
4379 }
4380 }
4381
4382 void Client::kick_flushing_caps(MetaSession *session)
4383 {
4384 mds_rank_t mds = session->mds_num;
4385 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4386
4387 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4388 Inode *in = *p;
4389 if (session->early_flushing_caps.count(in))
4390 continue;
4391 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4392 if (in->cap_snaps.size())
4393 flush_snaps(in, true);
4394 if (in->flushing_caps)
4395 flush_caps(in, session);
4396 }
4397
4398 session->early_flushing_caps.clear();
4399 }
4400
4401 void Client::early_kick_flushing_caps(MetaSession *session)
4402 {
4403 session->early_flushing_caps.clear();
4404
4405 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4406 Inode *in = *p;
4407 assert(in->auth_cap);
4408
4409 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4410 // stage. This guarantees that MDS processes the cap flush message before issuing
4411 // the flushing caps to other client.
4412 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4413 continue;
4414
4415 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4416 << " to mds." << session->mds_num << dendl;
4417
4418 session->early_flushing_caps.insert(in);
4419
4420 if (in->cap_snaps.size())
4421 flush_snaps(in, true);
4422 if (in->flushing_caps)
4423 flush_caps(in, session);
4424
4425 }
4426 }
4427
4428 void Client::kick_maxsize_requests(MetaSession *session)
4429 {
4430 xlist<Cap*>::iterator iter = session->caps.begin();
4431 while (!iter.end()){
4432 (*iter)->inode->requested_max_size = 0;
4433 (*iter)->inode->wanted_max_size = 0;
4434 signal_cond_list((*iter)->inode->waitfor_caps);
4435 ++iter;
4436 }
4437 }
4438
4439 void SnapRealm::build_snap_context()
4440 {
4441 set<snapid_t> snaps;
4442 snapid_t max_seq = seq;
4443
4444 // start with prior_parents?
4445 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4446 snaps.insert(prior_parent_snaps[i]);
4447
4448 // current parent's snaps
4449 if (pparent) {
4450 const SnapContext& psnapc = pparent->get_snap_context();
4451 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4452 if (psnapc.snaps[i] >= parent_since)
4453 snaps.insert(psnapc.snaps[i]);
4454 if (psnapc.seq > max_seq)
4455 max_seq = psnapc.seq;
4456 }
4457
4458 // my snaps
4459 for (unsigned i=0; i<my_snaps.size(); i++)
4460 snaps.insert(my_snaps[i]);
4461
4462 // ok!
4463 cached_snap_context.seq = max_seq;
4464 cached_snap_context.snaps.resize(0);
4465 cached_snap_context.snaps.reserve(snaps.size());
4466 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4467 cached_snap_context.snaps.push_back(*p);
4468 }
4469
4470 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4471 {
4472 list<SnapRealm*> q;
4473 q.push_back(realm);
4474
4475 while (!q.empty()) {
4476 realm = q.front();
4477 q.pop_front();
4478
4479 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4480 realm->invalidate_cache();
4481
4482 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4483 p != realm->pchildren.end();
4484 ++p)
4485 q.push_back(*p);
4486 }
4487 }
4488
4489 SnapRealm *Client::get_snap_realm(inodeno_t r)
4490 {
4491 SnapRealm *realm = snap_realms[r];
4492 if (!realm)
4493 snap_realms[r] = realm = new SnapRealm(r);
4494 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4495 realm->nref++;
4496 return realm;
4497 }
4498
4499 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4500 {
4501 if (snap_realms.count(r) == 0) {
4502 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4503 return NULL;
4504 }
4505 SnapRealm *realm = snap_realms[r];
4506 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4507 realm->nref++;
4508 return realm;
4509 }
4510
4511 void Client::put_snap_realm(SnapRealm *realm)
4512 {
4513 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4514 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4515 if (--realm->nref == 0) {
4516 snap_realms.erase(realm->ino);
4517 if (realm->pparent) {
4518 realm->pparent->pchildren.erase(realm);
4519 put_snap_realm(realm->pparent);
4520 }
4521 delete realm;
4522 }
4523 }
4524
4525 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4526 {
4527 if (realm->parent != parent) {
4528 ldout(cct, 10) << "adjust_realm_parent " << *realm
4529 << " " << realm->parent << " -> " << parent << dendl;
4530 realm->parent = parent;
4531 if (realm->pparent) {
4532 realm->pparent->pchildren.erase(realm);
4533 put_snap_realm(realm->pparent);
4534 }
4535 realm->pparent = get_snap_realm(parent);
4536 realm->pparent->pchildren.insert(realm);
4537 return true;
4538 }
4539 return false;
4540 }
4541
4542 static bool has_new_snaps(const SnapContext& old_snapc,
4543 const SnapContext& new_snapc)
4544 {
4545 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4546 }
4547
4548
4549 void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4550 {
4551 SnapRealm *first_realm = NULL;
4552 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4553
4554 map<SnapRealm*, SnapContext> dirty_realms;
4555
4556 bufferlist::iterator p = bl.begin();
4557 while (!p.end()) {
4558 SnapRealmInfo info;
4559 ::decode(info, p);
4560 SnapRealm *realm = get_snap_realm(info.ino());
4561
4562 bool invalidate = false;
4563
4564 if (info.seq() > realm->seq) {
4565 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4566 << dendl;
4567
4568 if (flush) {
4569 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4570 // flush me + children
4571 list<SnapRealm*> q;
4572 q.push_back(realm);
4573 while (!q.empty()) {
4574 SnapRealm *realm = q.front();
4575 q.pop_front();
4576
4577 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4578 p != realm->pchildren.end();
4579 ++p)
4580 q.push_back(*p);
4581
4582 if (dirty_realms.count(realm) == 0) {
4583 realm->nref++;
4584 dirty_realms[realm] = realm->get_snap_context();
4585 }
4586 }
4587 }
4588
4589 // update
4590 realm->seq = info.seq();
4591 realm->created = info.created();
4592 realm->parent_since = info.parent_since();
4593 realm->prior_parent_snaps = info.prior_parent_snaps;
4594 realm->my_snaps = info.my_snaps;
4595 invalidate = true;
4596 }
4597
4598 // _always_ verify parent
4599 if (adjust_realm_parent(realm, info.parent()))
4600 invalidate = true;
4601
4602 if (invalidate) {
4603 invalidate_snaprealm_and_children(realm);
4604 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4605 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4606 } else {
4607 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4608 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4609 }
4610
4611 if (!first_realm)
4612 first_realm = realm;
4613 else
4614 put_snap_realm(realm);
4615 }
4616
4617 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4618 q != dirty_realms.end();
4619 ++q) {
4620 SnapRealm *realm = q->first;
4621 // if there are new snaps ?
4622 if (has_new_snaps(q->second, realm->get_snap_context())) {
4623 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4624 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4625 while (!r.end()) {
4626 Inode *in = *r;
4627 ++r;
4628 queue_cap_snap(in, q->second);
4629 }
4630 } else {
4631 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4632 }
4633 put_snap_realm(realm);
4634 }
4635
4636 if (realm_ret)
4637 *realm_ret = first_realm;
4638 else
4639 put_snap_realm(first_realm);
4640 }
4641
// Handle an MClientSnap push from an MDS: apply the snap trace to our realm
// tree, and for a SPLIT op move the listed inodes and child realms under
// the newly split-off realm.  Consumes the message.
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  // inodes to re-parent into the split realm, each with the snap context it
  // had under its old realm (used later to decide on cap-snap writeback)
  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
	 p != m->split_inos.end();
	 ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	// don't move an inode out of a realm newer than the split target
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


	// detach from the old realm now; re-attachment happens below after
	// update_snap_trace has processed the message's snap trace
	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
	 p != m->split_realms.end();
	 ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // apply the trace; skip pre-flushing dirty caps when realms are being
  // destroyed
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
4719
4720 void Client::handle_quota(MClientQuota *m)
4721 {
4722 mds_rank_t mds = mds_rank_t(m->get_source().num());
4723 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4724 if (!session) {
4725 m->put();
4726 return;
4727 }
4728
4729 got_mds_push(session);
4730
4731 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4732
4733 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4734 if (inode_map.count(vino)) {
4735 Inode *in = NULL;
4736 in = inode_map[vino];
4737
4738 if (in) {
4739 in->quota = m->quota;
4740 in->rstat = m->rstat;
4741 }
4742 }
4743
4744 m->put();
4745 }
4746
// Top-level dispatcher for MClientCaps messages from an MDS.  The message
// is consumed here or by the per-op handler it is forwarded to.
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload(); // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // EXPORT and FLUSHSNAP_ACK are fully handled (and m consumed) by their
  // handlers.  IMPORT is special: handle_cap_import() runs first, then
  // control deliberately falls through to the second switch below so the
  // IMPORT is also processed as a grant on the (new) cap.
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
4820
// Handle CEPH_CAP_OP_IMPORT: this session's MDS has taken over the inode's
// caps (e.g. after subtree migration).  Adds/updates the cap for this
// session, removes the old peer cap if it matches, and re-flushes pending
// cap/snap flushes if we are now talking to the auth MDS.  Does NOT
// consume m — handle_caps falls through to the grant path afterwards.
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  // capture the perms associated with the cap we are importing from, if any
  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap) {
      cap_perms = cap->latest_perms;
    }
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
		 CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the old peer cap now that it has been superseded
  if (cap && cap->cap_id == m->peer.cap_id) {
      remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4861
// Handle CEPH_CAP_OP_EXPORT: mds is migrating our cap away.  If the message
// names a peer cap, merge this cap's grants into the peer session's cap
// (creating one there if needed); otherwise, remember that the auth cap was
// dropped so later logic can recover.  Consumes m.
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  // only act if the export refers to the cap we actually hold
  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
	Cap *tcap = in->caps[peer_mds];
	// merge into the existing peer cap, but only if the peer's view is
	// not already newer than what this message describes
	if (tcap->cap_id == m->peer.cap_id &&
	    ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
	  tcap->cap_id = m->peer.cap_id;
	  tcap->seq = m->peer.seq - 1;
	  tcap->issue_seq = tcap->seq;
	  tcap->issued |= cap->issued;
	  tcap->implemented |= cap->issued;
	  if (cap == in->auth_cap)
	    in->auth_cap = tcap;
	  // migrate any in-flight flushes along with auth-ness
	  if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
	    adjust_session_flushing_caps(in, session, tsession);
	}
      } else {
	// no cap on the peer session yet; create one carrying our grants
	add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
		       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
		       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
		       cap->latest_perms);
      }
    } else {
      // no peer: the cap is simply going away
      if (cap == in->auth_cap)
	in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
4908
4909 void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4910 {
4911 mds_rank_t mds = session->mds_num;
4912 assert(in->caps[mds]);
4913
4914 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4915 << " size " << in->size << " -> " << m->get_size()
4916 << dendl;
4917
4918 int issued;
4919 in->caps_issued(&issued);
4920 issued |= in->caps_dirty();
4921 update_inode_file_size(in, issued, m->get_size(),
4922 m->get_truncate_seq(), m->get_truncate_size());
4923 m->put();
4924 }
4925
// Process a FLUSH_ACK from the MDS: retire every flush tid <= the acked
// tid, compute which cap bits are now fully clean, update flushing state,
// and wake waiters (wait_sync_caps et al.).  Consumes m.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;   // cap bits whose flush is now fully complete
  int flushed = 0;   // number of flush tids retired by this ack

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      // everything up to and including the acked tid is done
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // a later flush is still in flight for these bits, so they are not
    // clean yet; stop early once no acked bit remains clean
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
	  << " cleaned " << ccap_string(cleaned) << " on " << *in
	  << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake wait_sync_caps() if this session has nothing older outstanding
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
	      << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      // drop the ref held while caps were dirty/flushing
      if (!in->caps_dirty())
	put_inode(in);
    }
  }

  m->put();
}
4985
4986
4987 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
4988 {
4989 mds_rank_t mds = session->mds_num;
4990 assert(in->caps[mds]);
4991 snapid_t follows = m->get_snap_follows();
4992
4993 if (in->cap_snaps.count(follows)) {
4994 CapSnap &capsnap = in->cap_snaps.at(follows);
4995 if (m->get_client_tid() != capsnap.flush_tid) {
4996 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
4997 } else {
4998 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
4999 << " on " << *in << dendl;
5000 InodeRef tmp_ref;
5001 if (in->get_num_ref() == 1)
5002 tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
5003 if (in->flushing_caps == 0 && in->cap_snaps.empty())
5004 in->flushing_cap_item.remove_myself();
5005 session->flushing_caps_tids.erase(capsnap.flush_tid);
5006 in->cap_snaps.erase(follows);
5007 }
5008 } else {
5009 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
5010 << " on " << *in << dendl;
5011 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5012 }
5013
5014 m->put();
5015 }
5016
5017 class C_Client_DentryInvalidate : public Context {
5018 private:
5019 Client *client;
5020 vinodeno_t dirino;
5021 vinodeno_t ino;
5022 string name;
5023 public:
5024 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5025 client(c), name(dn->name) {
5026 if (client->use_faked_inos()) {
5027 dirino.ino = dn->dir->parent_inode->faked_ino;
5028 if (del)
5029 ino.ino = dn->inode->faked_ino;
5030 } else {
5031 dirino = dn->dir->parent_inode->vino();
5032 if (del)
5033 ino = dn->inode->vino();
5034 }
5035 if (!del)
5036 ino.ino = inodeno_t();
5037 }
5038 void finish(int r) override {
5039 // _async_dentry_invalidate is responsible for its own locking
5040 assert(!client->client_lock.is_locked_by_me());
5041 client->_async_dentry_invalidate(dirino, ino, name);
5042 }
5043 };
5044
5045 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5046 {
5047 if (unmounting)
5048 return;
5049 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
5050 << " in dir " << dirino << dendl;
5051 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5052 }
5053
5054 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5055 {
5056 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5057 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5058 }
5059
// Best-effort attempt to make 'in' droppable: expire its dentries, close
// its dir and snapdir, and — if sched_inval is set and the kernel still
// references it — schedule dentry invalidations and unlink its dentries.
// 'ref' tracks how many references remain as sub-objects are released.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      // advance before unlink() may remove dn from the map
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one reference on the inode
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      // so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
5102
// Handle GRANT/REVOKE (and the grant half of IMPORT): apply the MDS's new
// metadata to the inode (guarded by which caps we hold exclusively), adjust
// the cap's issued/implemented bits, start writeback/releases needed to
// satisfy a revocation, and wake waiters.  Consumes m.
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // each metadata group below is accepted only if we do not hold the
  // corresponding EXCL cap (otherwise our local copy is authoritative)
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    // to satisfy the revoke: first write back dirty buffers if BUFFER is
    // being revoked and in use; else drop the page cache if CACHE is going;
    // otherwise just ack via check_caps
    if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (revoked & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
	if (it->second == cap)
	  continue;
	if (it->second->implemented & ~it->second->issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
5245
5246 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5247 {
5248 if (perms.uid() == 0)
5249 return 0;
5250
5251 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5252 int ret = _posix_acl_permission(in, perms, want);
5253 if (ret != -EAGAIN)
5254 return ret;
5255 }
5256
5257 // check permissions before doing anything else
5258 if (!in->check_mode(perms, want))
5259 return -EACCES;
5260 return 0;
5261 }
5262
5263 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5264 const UserPerm& perms)
5265 {
5266 int r = _getattr_for_perm(in, perms);
5267 if (r < 0)
5268 goto out;
5269
5270 r = 0;
5271 if (strncmp(name, "system.", 7) == 0) {
5272 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5273 r = -EPERM;
5274 } else {
5275 r = inode_permission(in, perms, want);
5276 }
5277 out:
5278 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
5279 return r;
5280 }
5281
5282 ostream& operator<<(ostream &out, const UserPerm& perm) {
5283 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5284 return out;
5285 }
5286
// Permission check for a setattr-style operation: returns 0 if `perms`
// may apply the attribute changes described by (stx, mask) to `in`,
// otherwise a negative errno (-EPERM / -EACCES / getattr error).
// Mirrors POSIX chown/chmod/utimes ownership rules.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // make sure we have fresh mode/owner (and ACLs, if enabled)
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  // truncate requires write permission on the file itself
  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here on, failures are ownership violations
  r = -EPERM;
  // chown: only root, or a no-op "change" to the same uid by the owner
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  // chgrp: owner may switch to one of their own groups (or keep the gid)
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       		     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  // chmod: owner only; strip setgid when the caller isn't in the
  // (effective) owning group, like the kernel does
  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  // timestamps: non-owners may only set mtime/atime to "now", and then
  // only if they have write permission (utimes(NULL) semantics)
  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	// explicit timestamp by a non-owner: refuse
	goto out;
      } else {
	// "set to now" only needs write access
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5343
5344 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5345 {
5346 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5347 unsigned want = 0;
5348
5349 if ((flags & O_ACCMODE) == O_WRONLY)
5350 want = MAY_WRITE;
5351 else if ((flags & O_ACCMODE) == O_RDWR)
5352 want = MAY_READ | MAY_WRITE;
5353 else if ((flags & O_ACCMODE) == O_RDONLY)
5354 want = MAY_READ;
5355 if (flags & O_TRUNC)
5356 want |= MAY_WRITE;
5357
5358 int r = 0;
5359 switch (in->mode & S_IFMT) {
5360 case S_IFLNK:
5361 r = -ELOOP;
5362 goto out;
5363 case S_IFDIR:
5364 if (want & MAY_WRITE) {
5365 r = -EISDIR;
5366 goto out;
5367 }
5368 break;
5369 }
5370
5371 r = _getattr_for_perm(in, perms);
5372 if (r < 0)
5373 goto out;
5374
5375 r = inode_permission(in, perms, want);
5376 out:
5377 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5378 return r;
5379 }
5380
5381 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5382 {
5383 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5384 int r = _getattr_for_perm(dir, perms);
5385 if (r < 0)
5386 goto out;
5387
5388 r = inode_permission(dir, perms, MAY_EXEC);
5389 out:
5390 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5391 return r;
5392 }
5393
5394 int Client::may_create(Inode *dir, const UserPerm& perms)
5395 {
5396 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5397 int r = _getattr_for_perm(dir, perms);
5398 if (r < 0)
5399 goto out;
5400
5401 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5402 out:
5403 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5404 return r;
5405 }
5406
5407 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5408 {
5409 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
5410 int r = _getattr_for_perm(dir, perms);
5411 if (r < 0)
5412 goto out;
5413
5414 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5415 if (r < 0)
5416 goto out;
5417
5418 /* 'name == NULL' means rmsnap */
5419 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5420 InodeRef otherin;
5421 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5422 if (r < 0)
5423 goto out;
5424 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5425 r = -EPERM;
5426 }
5427 out:
5428 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5429 return r;
5430 }
5431
5432 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5433 {
5434 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5435 int r = _getattr_for_perm(in, perms);
5436 if (r < 0)
5437 goto out;
5438
5439 if (perms.uid() == 0 || perms.uid() == in->uid) {
5440 r = 0;
5441 goto out;
5442 }
5443
5444 r = -EPERM;
5445 if (!S_ISREG(in->mode))
5446 goto out;
5447
5448 if (in->mode & S_ISUID)
5449 goto out;
5450
5451 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5452 goto out;
5453
5454 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5455 out:
5456 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5457 return r;
5458 }
5459
5460 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5461 {
5462 int mask = CEPH_STAT_CAP_MODE;
5463 bool force = false;
5464 if (acl_type != NO_ACL) {
5465 mask |= CEPH_STAT_CAP_XATTR;
5466 force = in->xattr_version == 0;
5467 }
5468 return _getattr(in, mask, perms, force);
5469 }
5470
// Return the versioned inode number (ino, snapid) for `in`.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5476
// Return the plain inode number for `in` (no snapshot component).
inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  return in->ino;
}
5482
5483
5484 /**
5485 * Resolve an MDS spec to a list of MDS daemon GIDs.
5486 *
5487 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5488 * It may be '*' in which case it matches all GIDs.
5489 *
5490 * If no error is returned, the `targets` vector will be populated with at least
5491 * one MDS.
5492 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  // requires a previously fetched FSMap (see fetch_fsmap)
  assert(fsmap);
  assert(targets != nullptr);

  // 1) try to parse the spec as a role (rank or filesystem:rank)
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
        fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  // 2) try to parse it as a numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // 3) It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // 4) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      // dump the whole map to aid debugging the bad spec
      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5555
5556
5557 /**
5558 * Authenticate with mon and establish global ID
5559 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  // already done? nothing to do
  if (monclient->is_authenticated()) {
    return 0;
  }

  // monclient->authenticate() blocks and takes its own locks, so the
  // client lock must be dropped across the call and re-taken after
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  // adopt the mon-assigned global id as our client identity
  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5580
// Fetch the cluster FSMap (or the trimmed-down FSMapUser when `user` is
// true) from the monitors and wait until our local copy is at least as
// new. Returns 0 on success or a negative errno from get_version.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    // drop the client lock while blocking on the mon round-trip
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe (one-shot) and wait until the user-visible map catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    // same dance for the full FSMap
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5624
5625 /**
5626 *
5627 * @mds_spec one of ID, rank, GID, "*"
5628 *
5629 */
// Send an admin command to the MDS daemon(s) named by `mds_spec` (ID,
// rank, GID, or "*"). Output buffers/strings are filled in by
// handle_command_reply; `onfinish` fires once all targets have replied.
// Returns 0 if the command(s) were dispatched, else a negative errno.
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need the FSMap to resolve specs and find daemon addresses
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets; the gather completes `onfinish` once every
  // per-target sub-context has been completed by handle_command_reply.
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state; the op is tracked in command_table
    // until its reply arrives
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
5712
5713 void Client::handle_command_reply(MCommandReply *m)
5714 {
5715 ceph_tid_t const tid = m->get_tid();
5716
5717 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5718
5719 if (!command_table.exists(tid)) {
5720 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5721 m->put();
5722 return;
5723 }
5724
5725 auto &op = command_table.get_command(tid);
5726 if (op.outbl) {
5727 op.outbl->claim(m->get_data());
5728 }
5729 if (op.outs) {
5730 *op.outs = m->rs;
5731 }
5732
5733 if (op.on_finish) {
5734 op.on_finish->complete(m->r);
5735 }
5736
5737 command_table.erase(tid);
5738
5739 m->put();
5740 }
5741
5742 // -------------------
5743 // MOUNT
5744
// Mount the filesystem at `mount_root` ("" means "/"). Authenticates,
// subscribes to the MDS map (optionally a specific namespace's map),
// optionally waits for an available MDS cluster, then walks up from the
// mount point issuing GETATTRs so quota/parent info is populated.
// Returns 0, a negative errno, or CEPH_FUSE_NO_MDS_UP.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    // a specific filesystem was requested: resolve its cluster id via the
    // user FSMap and subscribe to that filesystem's mdsmap ("mdsmap.<cid>")
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    // block until the MDS cluster can actually serve us (or error out)
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // Walk from the mount point up to the root, doing a GETATTR at each
  // level; this primes the cache (and quota info) for ancestors.
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      // EACCES on an ancestor is tolerable once the root itself resolved
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  assert(root);
  _ll_get(root);	// pin the root for the lifetime of the mount

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5857
5858 // UNMOUNT
5859
5860 void Client::_close_sessions()
5861 {
5862 while (!mds_sessions.empty()) {
5863 // send session closes!
5864 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5865 p != mds_sessions.end();
5866 ++p) {
5867 if (p->second->state != MetaSession::STATE_CLOSING) {
5868 _close_mds_session(p->second);
5869 }
5870 }
5871
5872 // wait for sessions to close
5873 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5874 mount_cond.Wait(client_lock);
5875 }
5876 }
5877
5878 void Client::flush_mdlog_sync()
5879 {
5880 if (mds_requests.empty())
5881 return;
5882 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5883 p != mds_sessions.end();
5884 ++p) {
5885 MetaSession *s = p->second;
5886 flush_mdlog(s);
5887 }
5888 }
5889
5890 void Client::flush_mdlog(MetaSession *session)
5891 {
5892 // Only send this to Luminous or newer MDS daemons, older daemons
5893 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5894 const uint64_t features = session->con->get_features();
5895 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5896 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5897 session->con->send_message(m);
5898 }
5899 }
5900
5901
// Tear down the mount: drain in-flight MDS requests, close every open
// file/dir handle, flush dirty data and caps (unless blacklisted), wait
// for the cache to empty, then close all MDS sessions. Caller holds
// client_lock; idempotent once `unmounting` is set.
void Client::_unmount()
{
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  deleg_timeout = 0;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  // wait for every outstanding MDS request to complete
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  // stop the periodic tick
  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and any unclosed low-level (ll_) handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // ... and any unclosed directory handles
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    // we cannot talk to the cluster anymore; skip flushing and bail out
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
	objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  // wait for un-acked sync writes to be committed by the OSDs
  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    // (iterate with an explicit `next` because _release/_flush may
    //  drop the inode and invalidate the current iterator)
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
	 p != inode_map.end();
	 p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
	assert(in);
      }
      if (!in->caps.empty()) {
	InodeRef tmp_ref(in);
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  // wait for the cache to fully drain (caps being released by the MDS);
  // dump the cache for debugging if we stall for 5 seconds
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6024
// Public unmount entry point: takes the client lock and delegates to
// _unmount() for the actual teardown.
void Client::unmount()
{
  Mutex::Locker lock(client_lock);
  _unmount();
}
6030
6031 void Client::flush_cap_releases()
6032 {
6033 // send any cap releases
6034 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6035 p != mds_sessions.end();
6036 ++p) {
6037 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
6038 p->first)) {
6039 if (cct->_conf->client_inject_release_failure) {
6040 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
6041 p->second->release->put();
6042 } else {
6043 p->second->con->send_message(p->second->release);
6044 }
6045 p->second->release = 0;
6046 }
6047 }
6048 }
6049
// Periodic housekeeping, re-armed via the timer on every call: abort
// timed-out pre-mount requests, renew caps, flush cap releases, check
// delayed caps, and trim the cache. Runs under client_lock (Timer holds
// it when calling back).
void Client::tick()
{
  // debug hook: artificially delay the tick, then reset the knob
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  // re-arm ourselves for the next interval
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  // before mount completes: time out the oldest pending request and wake
  // everyone waiting on it / on the mdsmap / on session opens
  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p)
	signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: re-check inodes whose hold period has expired
  // (the list is ordered; stop at the first not-yet-due entry)
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6106
6107 void Client::renew_caps()
6108 {
6109 ldout(cct, 10) << "renew_caps()" << dendl;
6110 last_cap_renew = ceph_clock_now();
6111
6112 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6113 p != mds_sessions.end();
6114 ++p) {
6115 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6116 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6117 renew_caps(p->second);
6118 }
6119 }
6120
6121 void Client::renew_caps(MetaSession *session)
6122 {
6123 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6124 session->last_cap_renew_request = ceph_clock_now();
6125 uint64_t seq = ++session->cap_renew_seq;
6126 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6127 }
6128
6129
6130 // ===============================================================
6131 // high level (POSIXy) interface
6132
6133 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6134 InodeRef *target, const UserPerm& perms)
6135 {
6136 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6137 MetaRequest *req = new MetaRequest(op);
6138 filepath path;
6139 dir->make_nosnap_relative_path(path);
6140 path.push_dentry(name);
6141 req->set_filepath(path);
6142 req->set_inode(dir);
6143 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6144 mask |= DEBUG_GETATTR_CAPS;
6145 req->head.args.getattr.mask = mask;
6146
6147 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6148
6149 int r = make_request(req, perms, target);
6150 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
6151 return r;
6152 }
6153
// Look up `dname` in `dir`, preferring cached dentries when a dentry
// lease or the directory's FILE_SHARED cap makes them trustworthy, and
// falling back to an MDS round-trip (_do_lookup) otherwise. `mask` is
// the cap mask the caller needs on the result. Fills *target; returns 0
// or a negative errno (-ENOTDIR, -ENAMETOOLONG, -ENOENT, ...).
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  // ".." resolves locally via the parent pointer
  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  // magic snapshot directory (e.g. ".snap") on a live inode
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    // only trust the cached dentry if its inode carries the caps the
    // caller asked for (or it is a null dentry)
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession *s = mds_sessions[dn->lease_mds];
	// the lease is only valid while the issuing session's cap state
	// is itself current (same generation, unexpired)
	if (s->cap_ttl > now &&
	    s->cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?  (FILE_SHARED on the directory covers its dentries)
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	      goto hit_dn;
	// a null dentry in a complete directory proves non-existence
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << "_lookup concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss / untrusted cache: ask the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6255
6256 int Client::get_or_create(Inode *dir, const char* name,
6257 Dentry **pdn, bool expect_null)
6258 {
6259 // lookup
6260 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6261 dir->open_dir();
6262 if (dir->dir->dentries.count(name)) {
6263 Dentry *dn = dir->dir->dentries[name];
6264
6265 // is dn lease valid?
6266 utime_t now = ceph_clock_now();
6267 if (dn->inode &&
6268 dn->lease_mds >= 0 &&
6269 dn->lease_ttl > now &&
6270 mds_sessions.count(dn->lease_mds)) {
6271 MetaSession *s = mds_sessions[dn->lease_mds];
6272 if (s->cap_ttl > now &&
6273 s->cap_gen == dn->lease_gen) {
6274 if (expect_null)
6275 return -EEXIST;
6276 }
6277 }
6278 *pdn = dn;
6279 } else {
6280 // otherwise link up a new one
6281 *pdn = link(dir->dir, name, NULL, NULL);
6282 }
6283
6284 // success
6285 return 0;
6286 }
6287
// Resolve `origpath` component by component from root (absolute) or cwd
// (relative), following directory symlinks always and the trailing
// symlink only when `followsym`. `mask` is extra caps wanted on the
// final component. Fills *end on success; returns 0 or a negative errno
// (-ENOENT, -ELOOP after MAXSYMLINKS expansions, permission errors).
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;	// total symlink expansions, bounded by MAXSYMLINKS

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // need search permission on each intermediate directory
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  // absolute target: restart the walk from the root
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6369
6370
6371 // namespace ops
6372
6373 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6374 {
6375 Mutex::Locker lock(client_lock);
6376 tout(cct) << "link" << std::endl;
6377 tout(cct) << relexisting << std::endl;
6378 tout(cct) << relpath << std::endl;
6379
6380 if (unmounting)
6381 return -ENOTCONN;
6382
6383 filepath existing(relexisting);
6384
6385 InodeRef in, dir;
6386 int r = path_walk(existing, &in, perm, true);
6387 if (r < 0)
6388 return r;
6389 if (std::string(relpath) == "/") {
6390 r = -EEXIST;
6391 return r;
6392 }
6393 filepath path(relpath);
6394 string name = path.last_dentry();
6395 path.pop_dentry();
6396
6397 r = path_walk(path, &dir, perm, true);
6398 if (r < 0)
6399 return r;
6400 if (cct->_conf->client_permissions) {
6401 if (S_ISDIR(in->mode)) {
6402 r = -EPERM;
6403 return r;
6404 }
6405 r = may_hardlink(in.get(), perm);
6406 if (r < 0)
6407 return r;
6408 r = may_create(dir.get(), perm);
6409 if (r < 0)
6410 return r;
6411 }
6412 r = _link(in.get(), dir.get(), name.c_str(), perm);
6413 return r;
6414 }
6415
6416 int Client::unlink(const char *relpath, const UserPerm& perm)
6417 {
6418 Mutex::Locker lock(client_lock);
6419 tout(cct) << "unlink" << std::endl;
6420 tout(cct) << relpath << std::endl;
6421
6422 if (unmounting)
6423 return -ENOTCONN;
6424
6425 if (std::string(relpath) == "/")
6426 return -EISDIR;
6427
6428 filepath path(relpath);
6429 string name = path.last_dentry();
6430 path.pop_dentry();
6431 InodeRef dir;
6432 int r = path_walk(path, &dir, perm);
6433 if (r < 0)
6434 return r;
6435 if (cct->_conf->client_permissions) {
6436 r = may_delete(dir.get(), name.c_str(), perm);
6437 if (r < 0)
6438 return r;
6439 }
6440 return _unlink(dir.get(), name.c_str(), perm);
6441 }
6442
6443 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6444 {
6445 Mutex::Locker lock(client_lock);
6446 tout(cct) << "rename" << std::endl;
6447 tout(cct) << relfrom << std::endl;
6448 tout(cct) << relto << std::endl;
6449
6450 if (unmounting)
6451 return -ENOTCONN;
6452
6453 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6454 return -EBUSY;
6455
6456 filepath from(relfrom);
6457 filepath to(relto);
6458 string fromname = from.last_dentry();
6459 from.pop_dentry();
6460 string toname = to.last_dentry();
6461 to.pop_dentry();
6462
6463 InodeRef fromdir, todir;
6464 int r = path_walk(from, &fromdir, perm);
6465 if (r < 0)
6466 goto out;
6467 r = path_walk(to, &todir, perm);
6468 if (r < 0)
6469 goto out;
6470
6471 if (cct->_conf->client_permissions) {
6472 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6473 if (r < 0)
6474 return r;
6475 r = may_delete(todir.get(), toname.c_str(), perm);
6476 if (r < 0 && r != -ENOENT)
6477 return r;
6478 }
6479 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6480 out:
6481 return r;
6482 }
6483
6484 // dirs
6485
6486 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6487 {
6488 Mutex::Locker lock(client_lock);
6489 tout(cct) << "mkdir" << std::endl;
6490 tout(cct) << relpath << std::endl;
6491 tout(cct) << mode << std::endl;
6492 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6493
6494 if (unmounting)
6495 return -ENOTCONN;
6496
6497 if (std::string(relpath) == "/")
6498 return -EEXIST;
6499
6500 filepath path(relpath);
6501 string name = path.last_dentry();
6502 path.pop_dentry();
6503 InodeRef dir;
6504 int r = path_walk(path, &dir, perm);
6505 if (r < 0)
6506 return r;
6507 if (cct->_conf->client_permissions) {
6508 r = may_create(dir.get(), perm);
6509 if (r < 0)
6510 return r;
6511 }
6512 return _mkdir(dir.get(), name.c_str(), mode, perm);
6513 }
6514
/**
 * Create every missing directory along 'relpath' (like `mkdir -p`).
 *
 * Phase 1 walks the already-existing prefix of the path; phase 2 creates
 * one directory per remaining component.
 *
 * @return 0 on success; -EEXIST if the entire path already exists;
 *         negative errno from lookup/permission/create otherwise.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i==path.depth()) return -EEXIST;
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      // must be allowed to create children in the current parent
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if(-EEXIST == r && i < path.depth() - 1) {
      // a racing creator made this intermediate dir first; just look it up
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6571
6572 int Client::rmdir(const char *relpath, const UserPerm& perms)
6573 {
6574 Mutex::Locker lock(client_lock);
6575 tout(cct) << "rmdir" << std::endl;
6576 tout(cct) << relpath << std::endl;
6577
6578 if (unmounting)
6579 return -ENOTCONN;
6580
6581 if (std::string(relpath) == "/")
6582 return -EBUSY;
6583
6584 filepath path(relpath);
6585 string name = path.last_dentry();
6586 path.pop_dentry();
6587 InodeRef dir;
6588 int r = path_walk(path, &dir, perms);
6589 if (r < 0)
6590 return r;
6591 if (cct->_conf->client_permissions) {
6592 int r = may_delete(dir.get(), name.c_str(), perms);
6593 if (r < 0)
6594 return r;
6595 }
6596 return _rmdir(dir.get(), name.c_str(), perms);
6597 }
6598
6599 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6600 {
6601 Mutex::Locker lock(client_lock);
6602 tout(cct) << "mknod" << std::endl;
6603 tout(cct) << relpath << std::endl;
6604 tout(cct) << mode << std::endl;
6605 tout(cct) << rdev << std::endl;
6606
6607 if (unmounting)
6608 return -ENOTCONN;
6609
6610 if (std::string(relpath) == "/")
6611 return -EEXIST;
6612
6613 filepath path(relpath);
6614 string name = path.last_dentry();
6615 path.pop_dentry();
6616 InodeRef dir;
6617 int r = path_walk(path, &dir, perms);
6618 if (r < 0)
6619 return r;
6620 if (cct->_conf->client_permissions) {
6621 int r = may_create(dir.get(), perms);
6622 if (r < 0)
6623 return r;
6624 }
6625 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6626 }
6627
6628 // symlinks
6629
6630 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6631 {
6632 Mutex::Locker lock(client_lock);
6633 tout(cct) << "symlink" << std::endl;
6634 tout(cct) << target << std::endl;
6635 tout(cct) << relpath << std::endl;
6636
6637 if (unmounting)
6638 return -ENOTCONN;
6639
6640 if (std::string(relpath) == "/")
6641 return -EEXIST;
6642
6643 filepath path(relpath);
6644 string name = path.last_dentry();
6645 path.pop_dentry();
6646 InodeRef dir;
6647 int r = path_walk(path, &dir, perms);
6648 if (r < 0)
6649 return r;
6650 if (cct->_conf->client_permissions) {
6651 int r = may_create(dir.get(), perms);
6652 if (r < 0)
6653 return r;
6654 }
6655 return _symlink(dir.get(), name.c_str(), target, perms);
6656 }
6657
6658 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6659 {
6660 Mutex::Locker lock(client_lock);
6661 tout(cct) << "readlink" << std::endl;
6662 tout(cct) << relpath << std::endl;
6663
6664 if (unmounting)
6665 return -ENOTCONN;
6666
6667 filepath path(relpath);
6668 InodeRef in;
6669 int r = path_walk(path, &in, perms, false);
6670 if (r < 0)
6671 return r;
6672
6673 return _readlink(in.get(), buf, size);
6674 }
6675
6676 int Client::_readlink(Inode *in, char *buf, size_t size)
6677 {
6678 if (!in->is_symlink())
6679 return -EINVAL;
6680
6681 // copy into buf (at most size bytes)
6682 int r = in->symlink.length();
6683 if (r > (int)size)
6684 r = size;
6685 memcpy(buf, in->symlink.c_str(), r);
6686 return r;
6687 }
6688
6689
6690 // inode stuff
6691
6692 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6693 {
6694 bool yes = in->caps_issued_mask(mask, true);
6695
6696 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6697 if (yes && !force)
6698 return 0;
6699
6700 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6701 filepath path;
6702 in->make_nosnap_relative_path(path);
6703 req->set_filepath(path);
6704 req->set_inode(in);
6705 req->head.args.getattr.mask = mask;
6706
6707 int res = make_request(req, perms);
6708 ldout(cct, 10) << "_getattr result=" << res << dendl;
6709 return res;
6710 }
6711
/**
 * Apply attribute changes (CEPH_SETATTR_* bits in 'mask') to an inode.
 *
 * Changes covered by exclusive caps we hold (Ax for owner/mode/btime,
 * Fx for atime/mtime) are applied locally and the caps marked dirty;
 * any bits left over are sent synchronously to the MDS as a SETATTR.
 *
 * @param in inode to modify
 * @param stx attribute values to apply (fields selected by 'mask')
 * @param mask CEPH_SETATTR_* bits selecting which stx fields to use;
 *        mask==0 means "just bump the ctime"
 * @param perms caller credentials
 * @param inp optional out: inode ref from the MDS reply
 * @return 0 on success; -EROFS for snapshot inodes, -EDQUOT on quota,
 *         -EFBIG for an oversized truncate, or the MDS request result
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are read-only
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  // growing the file must not blow the quota
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
			      perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; otherwise go to the MDS
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // with the AUTH exclusive cap, owner/mode/btime changes can be local
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  // with the FILE exclusive cap, time changes can be local too
  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  // everything handled locally; done
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  // whatever remains in 'mask' goes to the MDS synchronously
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
6902
6903 /* Note that we only care about attrs that setattr cares about */
6904 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6905 {
6906 stx->stx_size = st->st_size;
6907 stx->stx_mode = st->st_mode;
6908 stx->stx_uid = st->st_uid;
6909 stx->stx_gid = st->st_gid;
6910 stx->stx_mtime = st->st_mtim;
6911 stx->stx_atime = st->st_atim;
6912 }
6913
6914 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6915 const UserPerm& perms, InodeRef *inp)
6916 {
6917 int ret = _do_setattr(in, stx, mask, perms, inp);
6918 if (ret < 0)
6919 return ret;
6920 if (mask & CEPH_SETATTR_MODE)
6921 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6922 return ret;
6923 }
6924
6925 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6926 const UserPerm& perms)
6927 {
6928 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6929 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6930 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6931 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6932 if (cct->_conf->client_permissions) {
6933 int r = may_setattr(in.get(), stx, mask, perms);
6934 if (r < 0)
6935 return r;
6936 }
6937 return __setattrx(in.get(), stx, mask, perms);
6938 }
6939
6940 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6941 const UserPerm& perms)
6942 {
6943 struct ceph_statx stx;
6944
6945 stat_to_statx(attr, &stx);
6946 mask &= ~CEPH_SETATTR_BTIME;
6947
6948 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6949 mask &= ~CEPH_SETATTR_UID;
6950 }
6951 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6952 mask &= ~CEPH_SETATTR_GID;
6953 }
6954
6955 return _setattrx(in, &stx, mask, perms);
6956 }
6957
6958 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6959 const UserPerm& perms)
6960 {
6961 Mutex::Locker lock(client_lock);
6962 tout(cct) << "setattr" << std::endl;
6963 tout(cct) << relpath << std::endl;
6964 tout(cct) << mask << std::endl;
6965
6966 if (unmounting)
6967 return -ENOTCONN;
6968
6969 filepath path(relpath);
6970 InodeRef in;
6971 int r = path_walk(path, &in, perms);
6972 if (r < 0)
6973 return r;
6974 return _setattr(in, attr, mask, perms);
6975 }
6976
6977 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6978 const UserPerm& perms, int flags)
6979 {
6980 Mutex::Locker lock(client_lock);
6981 tout(cct) << "setattrx" << std::endl;
6982 tout(cct) << relpath << std::endl;
6983 tout(cct) << mask << std::endl;
6984
6985 if (unmounting)
6986 return -ENOTCONN;
6987
6988 filepath path(relpath);
6989 InodeRef in;
6990 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6991 if (r < 0)
6992 return r;
6993 return _setattrx(in, stx, mask, perms);
6994 }
6995
6996 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6997 {
6998 Mutex::Locker lock(client_lock);
6999 tout(cct) << "fsetattr" << std::endl;
7000 tout(cct) << fd << std::endl;
7001 tout(cct) << mask << std::endl;
7002
7003 if (unmounting)
7004 return -ENOTCONN;
7005
7006 Fh *f = get_filehandle(fd);
7007 if (!f)
7008 return -EBADF;
7009 #if defined(__linux__) && defined(O_PATH)
7010 if (f->flags & O_PATH)
7011 return -EBADF;
7012 #endif
7013 return _setattr(f->inode, attr, mask, perms);
7014 }
7015
7016 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7017 {
7018 Mutex::Locker lock(client_lock);
7019 tout(cct) << "fsetattr" << std::endl;
7020 tout(cct) << fd << std::endl;
7021 tout(cct) << mask << std::endl;
7022
7023 if (unmounting)
7024 return -ENOTCONN;
7025
7026 Fh *f = get_filehandle(fd);
7027 if (!f)
7028 return -EBADF;
7029 #if defined(__linux__) && defined(O_PATH)
7030 if (f->flags & O_PATH)
7031 return -EBADF;
7032 #endif
7033 return _setattrx(f->inode, stx, mask, perms);
7034 }
7035
7036 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7037 frag_info_t *dirstat, int mask)
7038 {
7039 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7040 Mutex::Locker lock(client_lock);
7041 tout(cct) << "stat" << std::endl;
7042 tout(cct) << relpath << std::endl;
7043
7044 if (unmounting)
7045 return -ENOTCONN;
7046
7047 filepath path(relpath);
7048 InodeRef in;
7049 int r = path_walk(path, &in, perms, true, mask);
7050 if (r < 0)
7051 return r;
7052 r = _getattr(in, mask, perms);
7053 if (r < 0) {
7054 ldout(cct, 3) << "stat exit on error!" << dendl;
7055 return r;
7056 }
7057 fill_stat(in, stbuf, dirstat);
7058 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7059 return r;
7060 }
7061
7062 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7063 {
7064 unsigned mask = 0;
7065
7066 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7067 if (flags & AT_NO_ATTR_SYNC)
7068 goto out;
7069
7070 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7071 mask |= CEPH_CAP_PIN;
7072 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7073 mask |= CEPH_CAP_AUTH_SHARED;
7074 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7075 mask |= CEPH_CAP_LINK_SHARED;
7076 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7077 mask |= CEPH_CAP_FILE_SHARED;
7078 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7079 mask |= CEPH_CAP_XATTR_SHARED;
7080 out:
7081 return mask;
7082 }
7083
7084 int Client::statx(const char *relpath, struct ceph_statx *stx,
7085 const UserPerm& perms,
7086 unsigned int want, unsigned int flags)
7087 {
7088 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7089 Mutex::Locker lock(client_lock);
7090 tout(cct) << "statx" << std::endl;
7091 tout(cct) << relpath << std::endl;
7092
7093 if (unmounting)
7094 return -ENOTCONN;
7095
7096 filepath path(relpath);
7097 InodeRef in;
7098
7099 unsigned mask = statx_to_mask(flags, want);
7100
7101 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7102 if (r < 0)
7103 return r;
7104
7105 r = _getattr(in, mask, perms);
7106 if (r < 0) {
7107 ldout(cct, 3) << "statx exit on error!" << dendl;
7108 return r;
7109 }
7110
7111 fill_statx(in, mask, stx);
7112 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7113 return r;
7114 }
7115
7116 int Client::lstat(const char *relpath, struct stat *stbuf,
7117 const UserPerm& perms, frag_info_t *dirstat, int mask)
7118 {
7119 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7120 Mutex::Locker lock(client_lock);
7121 tout(cct) << "lstat" << std::endl;
7122 tout(cct) << relpath << std::endl;
7123
7124 if (unmounting)
7125 return -ENOTCONN;
7126
7127 filepath path(relpath);
7128 InodeRef in;
7129 // don't follow symlinks
7130 int r = path_walk(path, &in, perms, false, mask);
7131 if (r < 0)
7132 return r;
7133 r = _getattr(in, mask, perms);
7134 if (r < 0) {
7135 ldout(cct, 3) << "lstat exit on error!" << dendl;
7136 return r;
7137 }
7138 fill_stat(in, stbuf, dirstat);
7139 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7140 return r;
7141 }
7142
/**
 * Populate a POSIX struct stat from our cached inode state, optionally
 * copying out the directory fragstat/rstat.
 *
 * @return the caps currently issued on the inode (callers can use this
 *         to judge how authoritative the returned attributes are)
 */
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // the snapshot id doubles as the device number
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // synthesize nlink for directories from the subdir count
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is later as the POSIX ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // directory "size": recursive bytes or entry count, per config
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7204
/**
 * Populate a ceph_statx from cached inode state.  'mask' is the set of
 * cap bits we hold/refreshed; each statx field group is filled (and its
 * CEPH_STATX_* bit set in stx_mask) only when the corresponding caps
 * are present.
 */
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // ownership / mode / birth time need the AUTH shared cap
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // link count needs the LINK shared cap
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize nlink for directories from the subdir count
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // times and size need the FILE shared cap
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // directory "size": recursive bytes or entry count, per config
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // report whichever of ctime/mtime is later as the ctime
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7287
// Mark the dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7292
7293 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7294 {
7295 Mutex::Locker lock(client_lock);
7296 tout(cct) << "chmod" << std::endl;
7297 tout(cct) << relpath << std::endl;
7298 tout(cct) << mode << std::endl;
7299
7300 if (unmounting)
7301 return -ENOTCONN;
7302
7303 filepath path(relpath);
7304 InodeRef in;
7305 int r = path_walk(path, &in, perms);
7306 if (r < 0)
7307 return r;
7308 struct stat attr;
7309 attr.st_mode = mode;
7310 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7311 }
7312
7313 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7314 {
7315 Mutex::Locker lock(client_lock);
7316 tout(cct) << "fchmod" << std::endl;
7317 tout(cct) << fd << std::endl;
7318 tout(cct) << mode << std::endl;
7319
7320 if (unmounting)
7321 return -ENOTCONN;
7322
7323 Fh *f = get_filehandle(fd);
7324 if (!f)
7325 return -EBADF;
7326 #if defined(__linux__) && defined(O_PATH)
7327 if (f->flags & O_PATH)
7328 return -EBADF;
7329 #endif
7330 struct stat attr;
7331 attr.st_mode = mode;
7332 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7333 }
7334
7335 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7336 {
7337 Mutex::Locker lock(client_lock);
7338 tout(cct) << "lchmod" << std::endl;
7339 tout(cct) << relpath << std::endl;
7340 tout(cct) << mode << std::endl;
7341
7342 if (unmounting)
7343 return -ENOTCONN;
7344
7345 filepath path(relpath);
7346 InodeRef in;
7347 // don't follow symlinks
7348 int r = path_walk(path, &in, perms, false);
7349 if (r < 0)
7350 return r;
7351 struct stat attr;
7352 attr.st_mode = mode;
7353 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7354 }
7355
7356 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7357 const UserPerm& perms)
7358 {
7359 Mutex::Locker lock(client_lock);
7360 tout(cct) << "chown" << std::endl;
7361 tout(cct) << relpath << std::endl;
7362 tout(cct) << new_uid << std::endl;
7363 tout(cct) << new_gid << std::endl;
7364
7365 if (unmounting)
7366 return -ENOTCONN;
7367
7368 filepath path(relpath);
7369 InodeRef in;
7370 int r = path_walk(path, &in, perms);
7371 if (r < 0)
7372 return r;
7373 struct stat attr;
7374 attr.st_uid = new_uid;
7375 attr.st_gid = new_gid;
7376 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7377 }
7378
7379 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7380 {
7381 Mutex::Locker lock(client_lock);
7382 tout(cct) << "fchown" << std::endl;
7383 tout(cct) << fd << std::endl;
7384 tout(cct) << new_uid << std::endl;
7385 tout(cct) << new_gid << std::endl;
7386
7387 if (unmounting)
7388 return -ENOTCONN;
7389
7390 Fh *f = get_filehandle(fd);
7391 if (!f)
7392 return -EBADF;
7393 #if defined(__linux__) && defined(O_PATH)
7394 if (f->flags & O_PATH)
7395 return -EBADF;
7396 #endif
7397 struct stat attr;
7398 attr.st_uid = new_uid;
7399 attr.st_gid = new_gid;
7400 int mask = 0;
7401 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7402 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7403 return _setattr(f->inode, &attr, mask, perms);
7404 }
7405
7406 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7407 const UserPerm& perms)
7408 {
7409 Mutex::Locker lock(client_lock);
7410 tout(cct) << "lchown" << std::endl;
7411 tout(cct) << relpath << std::endl;
7412 tout(cct) << new_uid << std::endl;
7413 tout(cct) << new_gid << std::endl;
7414
7415 if (unmounting)
7416 return -ENOTCONN;
7417
7418 filepath path(relpath);
7419 InodeRef in;
7420 // don't follow symlinks
7421 int r = path_walk(path, &in, perms, false);
7422 if (r < 0)
7423 return r;
7424 struct stat attr;
7425 attr.st_uid = new_uid;
7426 attr.st_gid = new_gid;
7427 int mask = 0;
7428 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7429 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7430 return _setattr(in, &attr, mask, perms);
7431 }
7432
7433 int Client::utime(const char *relpath, struct utimbuf *buf,
7434 const UserPerm& perms)
7435 {
7436 Mutex::Locker lock(client_lock);
7437 tout(cct) << "utime" << std::endl;
7438 tout(cct) << relpath << std::endl;
7439 tout(cct) << buf->modtime << std::endl;
7440 tout(cct) << buf->actime << std::endl;
7441
7442 if (unmounting)
7443 return -ENOTCONN;
7444
7445 filepath path(relpath);
7446 InodeRef in;
7447 int r = path_walk(path, &in, perms);
7448 if (r < 0)
7449 return r;
7450 struct stat attr;
7451 stat_set_mtime_sec(&attr, buf->modtime);
7452 stat_set_mtime_nsec(&attr, 0);
7453 stat_set_atime_sec(&attr, buf->actime);
7454 stat_set_atime_nsec(&attr, 0);
7455 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7456 }
7457
7458 int Client::lutime(const char *relpath, struct utimbuf *buf,
7459 const UserPerm& perms)
7460 {
7461 Mutex::Locker lock(client_lock);
7462 tout(cct) << "lutime" << std::endl;
7463 tout(cct) << relpath << std::endl;
7464 tout(cct) << buf->modtime << std::endl;
7465 tout(cct) << buf->actime << std::endl;
7466
7467 if (unmounting)
7468 return -ENOTCONN;
7469
7470 filepath path(relpath);
7471 InodeRef in;
7472 // don't follow symlinks
7473 int r = path_walk(path, &in, perms, false);
7474 if (r < 0)
7475 return r;
7476 struct stat attr;
7477 stat_set_mtime_sec(&attr, buf->modtime);
7478 stat_set_mtime_nsec(&attr, 0);
7479 stat_set_atime_sec(&attr, buf->actime);
7480 stat_set_atime_nsec(&attr, 0);
7481 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7482 }
7483
7484 int Client::flock(int fd, int operation, uint64_t owner)
7485 {
7486 Mutex::Locker lock(client_lock);
7487 tout(cct) << "flock" << std::endl;
7488 tout(cct) << fd << std::endl;
7489 tout(cct) << operation << std::endl;
7490 tout(cct) << owner << std::endl;
7491
7492 if (unmounting)
7493 return -ENOTCONN;
7494
7495 Fh *f = get_filehandle(fd);
7496 if (!f)
7497 return -EBADF;
7498
7499 return _flock(f, operation, owner);
7500 }
7501
7502 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7503 {
7504 Mutex::Locker lock(client_lock);
7505 tout(cct) << "opendir" << std::endl;
7506 tout(cct) << relpath << std::endl;
7507
7508 if (unmounting)
7509 return -ENOTCONN;
7510
7511 filepath path(relpath);
7512 InodeRef in;
7513 int r = path_walk(path, &in, perms, true);
7514 if (r < 0)
7515 return r;
7516 if (cct->_conf->client_permissions) {
7517 int r = may_open(in.get(), O_RDONLY, perms);
7518 if (r < 0)
7519 return r;
7520 }
7521 r = _opendir(in.get(), dirpp, perms);
7522 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7523 if (r != -ENOTDIR)
7524 tout(cct) << (unsigned long)*dirpp << std::endl;
7525 return r;
7526 }
7527
7528 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7529 {
7530 if (!in->is_dir())
7531 return -ENOTDIR;
7532 *dirpp = new dir_result_t(in, perms);
7533 opened_dirs.insert(*dirpp);
7534 ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7535 return 0;
7536 }
7537
7538
7539 int Client::closedir(dir_result_t *dir)
7540 {
7541 Mutex::Locker lock(client_lock);
7542 tout(cct) << "closedir" << std::endl;
7543 tout(cct) << (unsigned long)dir << std::endl;
7544
7545 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7546 _closedir(dir);
7547 return 0;
7548 }
7549
7550 void Client::_closedir(dir_result_t *dirp)
7551 {
7552 ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
7553 if (dirp->inode) {
7554 ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
7555 dirp->inode.reset();
7556 }
7557 _readdir_drop_dirp_buffer(dirp);
7558 opened_dirs.erase(dirp);
7559 delete dirp;
7560 }
7561
7562 void Client::rewinddir(dir_result_t *dirp)
7563 {
7564 Mutex::Locker lock(client_lock);
7565 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7566
7567 if (unmounting)
7568 return;
7569
7570 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7571 _readdir_drop_dirp_buffer(d);
7572 d->reset();
7573 }
7574
7575 loff_t Client::telldir(dir_result_t *dirp)
7576 {
7577 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7578 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7579 return d->offset;
7580 }
7581
// Reposition a directory handle to `offset` (a value previously
// returned by telldir()).  Decides whether the currently buffered frag
// can still serve the new position or must be dropped and re-fetched.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  // no-op if we're already there
  if (offset == dirp->offset)
    return;

  // any seek invalidates the assumptions behind the readdir cache:
  // a forward seek means we skipped entries (can't release cleanly), a
  // backward seek means we'll revisit entries out of order
  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered listing: only a backward seek forces a refetch
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // frag-ordered listing: refetch when rewinding to the start, when
    // the target lands in a different frag, or when it lies before the
    // beginning of the currently buffered chunk
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7615
7616
7617 //struct dirent {
7618 // ino_t d_ino; /* inode number */
7619 // off_t d_off; /* offset to the next dirent */
7620 // unsigned short d_reclen; /* length of this record */
7621 // unsigned char d_type; /* type of file */
7622 // char d_name[256]; /* filename */
7623 //};
// Populate a struct dirent from name/type/ino, with `next_off` recorded
// as d_off (the offset of the entry *after* this one) on platforms that
// have that field.  Names longer than 255 bytes are silently truncated.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // copy at most 255 chars and always NUL-terminate (strncpy does not
  // terminate on truncation by itself)
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  // d_off only exists on Linux-like dirent layouts
  de->d_off = next_off;
#endif
  // d_reclen is not a real record length here; callers that need one
  // (e.g. _getdents) use sizeof(dirent) instead
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7639
// Advance the iteration state to the next directory fragment, or mark
// the listing complete if the current frag was the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    // no frags remain: end of directory
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // offset 2 skips the synthetic "." and ".." slots
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // frag-ordered: restart name continuation and re-map the frag
    // against our (possibly newer) dirfragtree
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7665
// Re-map the frag implied by the current offset through the inode's
// dirfragtree; if the tree has changed (frag split/merge) reposition the
// iterator at the start of the frag that now covers that range.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  // hash-ordered listings don't track a specific frag
  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    // restart from slot 2 (past "." and "..") of the remapped frag
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7682
7683 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7684 {
7685 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7686 dirp->buffer.clear();
7687 }
7688
7689 int Client::_readdir_get_frag(dir_result_t *dirp)
7690 {
7691 assert(dirp);
7692 assert(dirp->inode);
7693
7694 // get the current frag.
7695 frag_t fg;
7696 if (dirp->hash_order())
7697 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7698 else
7699 fg = frag_t(dirp->offset_high());
7700
7701 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7702 << " offset " << hex << dirp->offset << dec << dendl;
7703
7704 int op = CEPH_MDS_OP_READDIR;
7705 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7706 op = CEPH_MDS_OP_LSSNAP;
7707
7708 InodeRef& diri = dirp->inode;
7709
7710 MetaRequest *req = new MetaRequest(op);
7711 filepath path;
7712 diri->make_nosnap_relative_path(path);
7713 req->set_filepath(path);
7714 req->set_inode(diri.get());
7715 req->head.args.readdir.frag = fg;
7716 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7717 if (dirp->last_name.length()) {
7718 req->path2.set_path(dirp->last_name);
7719 } else if (dirp->hash_order()) {
7720 req->head.args.readdir.offset_hash = dirp->offset_high();
7721 }
7722 req->dirp = dirp;
7723
7724 bufferlist dirbl;
7725 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7726
7727 if (res == -EAGAIN) {
7728 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7729 _readdir_rechoose_frag(dirp);
7730 return _readdir_get_frag(dirp);
7731 }
7732
7733 if (res == 0) {
7734 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7735 << " size " << dirp->buffer.size() << dendl;
7736 } else {
7737 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7738 dirp->set_end();
7739 }
7740
7741 return res;
7742 }
7743
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by their frag-aware readdir offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
7749
7750 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7751 int caps, bool getref)
7752 {
7753 assert(client_lock.is_locked());
7754 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
7755 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7756 << dendl;
7757 Dir *dir = dirp->inode->dir;
7758
7759 if (!dir) {
7760 ldout(cct, 10) << " dir is empty" << dendl;
7761 dirp->set_end();
7762 return 0;
7763 }
7764
7765 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7766 dir->readdir_cache.end(),
7767 dirp->offset, dentry_off_lt());
7768
7769 string dn_name;
7770 while (true) {
7771 if (!dirp->inode->is_complete_and_ordered())
7772 return -EAGAIN;
7773 if (pd == dir->readdir_cache.end())
7774 break;
7775 Dentry *dn = *pd;
7776 if (dn->inode == NULL) {
7777 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7778 ++pd;
7779 continue;
7780 }
7781 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7782 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7783 ++pd;
7784 continue;
7785 }
7786
7787 int r = _getattr(dn->inode, caps, dirp->perms);
7788 if (r < 0)
7789 return r;
7790
7791 struct ceph_statx stx;
7792 struct dirent de;
7793 fill_statx(dn->inode, caps, &stx);
7794
7795 uint64_t next_off = dn->offset + 1;
7796 ++pd;
7797 if (pd == dir->readdir_cache.end())
7798 next_off = dir_result_t::END;
7799
7800 Inode *in = NULL;
7801 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7802 if (getref) {
7803 in = dn->inode.get();
7804 _ll_get(in);
7805 }
7806
7807 dn_name = dn->name; // fill in name while we have lock
7808
7809 client_lock.Unlock();
7810 r = cb(p, &de, &stx, next_off, in); // _next_ offset
7811 client_lock.Lock();
7812 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
7813 << " = " << r << dendl;
7814 if (r < 0) {
7815 return r;
7816 }
7817
7818 dirp->offset = next_off;
7819 if (dirp->at_end())
7820 dirp->next_offset = 2;
7821 else
7822 dirp->next_offset = dirp->offset_low();
7823 dirp->last_name = dn_name; // we successfully returned this one; update!
7824 dirp->release_count = 0; // last_name no longer match cache index
7825 if (r > 0)
7826 return r;
7827 }
7828
7829 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7830 dirp->set_end();
7831 return 0;
7832 }
7833
// Core readdir driver: feed directory entries to callback `cb` (with
// opaque arg `p`) until the callback returns nonzero or the directory is
// exhausted.  Synthesizes "." and ".." at offsets 0 and 1, then serves
// entries either from the local dentry cache (when the dir is complete
// and ordered and we hold FILE_SHARED caps) or frag-by-frag from the
// MDS.  `want`/`flags` select which statx fields the callback needs;
// `getref` makes each delivered inode carry an extra ll reference.
// Returns 0 at end of directory, the callback's positive value if it
// stopped the walk, or a negative error.
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // drop the lock around the callback; it may re-enter the client
    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".."
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;  // no parent dentry (e.g. root): ".." is the dir itself
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -EAGAIN: cache was invalidated under us; fall through to MDS path
    if (err != -EAGAIN)
      return err;
  }

  // MDS path: iterate frag by frag, buffering one chunk at a time
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	r = _getattr(entry.inode, caps, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      // current frag has more entries than fit in one reply
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // walked every frag; if nothing changed underneath us, mark the
    // directory's dentry cache complete (and ordered if no reordering
    // happened) so future listings can be served locally
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();  // not reached: the loop always returns
  return 0;
}
8026
8027
// POSIX-style readdir_r: fill *de with the next entry of d.
// Returns 1 when an entry was produced, 0 at end of directory, <0 on error.
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
8032
8033 /*
8034 * readdirplus_r
8035 *
8036 * returns
8037 * 1 if we got a dirent
8038 * 0 for end of directory
8039 * <0 on error
8040 */
8041
// Context for _readdir_single_dirent_cb: receives exactly one directory
// entry from readdir_r_cb().
struct single_readdir {
  struct dirent *de;       // destination dirent (never NULL)
  struct ceph_statx *stx;  // optional statx destination (may be NULL)
  Inode *inode;            // inode delivered with the entry
  bool full;               // set once an entry has been taken
};
8048
8049 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8050 struct ceph_statx *stx, off_t off,
8051 Inode *in)
8052 {
8053 single_readdir *c = static_cast<single_readdir *>(p);
8054
8055 if (c->full)
8056 return -1; // already filled this dirent
8057
8058 *c->de = *de;
8059 if (c->stx)
8060 *c->stx = *stx;
8061 c->inode = in;
8062 c->full = true;
8063 return 1;
8064 }
8065
8066 struct dirent *Client::readdir(dir_result_t *d)
8067 {
8068 int ret;
8069 static struct dirent de;
8070 single_readdir sr;
8071 sr.de = &de;
8072 sr.stx = NULL;
8073 sr.inode = NULL;
8074 sr.full = false;
8075
8076 // our callback fills the dirent and sets sr.full=true on first
8077 // call, and returns -1 the second time around.
8078 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8079 if (ret < -1) {
8080 errno = -ret; // this sucks.
8081 return (dirent *) NULL;
8082 }
8083 if (sr.full) {
8084 return &de;
8085 }
8086 return (dirent *) NULL;
8087 }
8088
8089 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8090 struct ceph_statx *stx, unsigned want,
8091 unsigned flags, Inode **out)
8092 {
8093 single_readdir sr;
8094 sr.de = de;
8095 sr.stx = stx;
8096 sr.inode = NULL;
8097 sr.full = false;
8098
8099 // our callback fills the dirent and sets sr.full=true on first
8100 // call, and returns -1 the second time around.
8101 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8102 if (r < -1)
8103 return r;
8104 if (out)
8105 *out = sr.inode;
8106 if (sr.full)
8107 return 1;
8108 return 0;
8109 }
8110
8111
8112 /* getdents */
/* getdents */
// Accumulator for _readdir_getdent_cb: packs entries into a flat buffer.
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole dirents; false: pack names only
};
8119
8120 static int _readdir_getdent_cb(void *p, struct dirent *de,
8121 struct ceph_statx *stx, off_t off, Inode *in)
8122 {
8123 struct getdents_result *c = static_cast<getdents_result *>(p);
8124
8125 int dlen;
8126 if (c->fullent)
8127 dlen = sizeof(*de);
8128 else
8129 dlen = strlen(de->d_name) + 1;
8130
8131 if (c->pos + dlen > c->buflen)
8132 return -1; // doesn't fit
8133
8134 if (c->fullent) {
8135 memcpy(c->buf + c->pos, de, sizeof(*de));
8136 } else {
8137 memcpy(c->buf + c->pos, de->d_name, dlen);
8138 }
8139 c->pos += dlen;
8140 return 0;
8141 }
8142
8143 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8144 {
8145 getdents_result gr;
8146 gr.buf = buf;
8147 gr.buflen = buflen;
8148 gr.fullent = fullent;
8149 gr.pos = 0;
8150
8151 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8152
8153 if (r < 0) { // some error
8154 if (r == -1) { // buffer ran out of space
8155 if (gr.pos) { // but we got some entries already!
8156 return gr.pos;
8157 } // or we need a larger buffer
8158 return -ERANGE;
8159 } else { // actual error, return it
8160 return r;
8161 }
8162 }
8163 return gr.pos;
8164 }
8165
8166
8167 /* getdir */
/* getdir */
// Accumulator for _getdir_cb: collects entry names and counts them.
struct getdir_result {
  list<string> *contents;  // destination list of names
  int num;                 // number of entries appended
};
8172
8173 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8174 {
8175 getdir_result *r = static_cast<getdir_result *>(p);
8176
8177 r->contents->push_back(de->d_name);
8178 r->num++;
8179 return 0;
8180 }
8181
8182 int Client::getdir(const char *relpath, list<string>& contents,
8183 const UserPerm& perms)
8184 {
8185 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8186 {
8187 Mutex::Locker lock(client_lock);
8188 tout(cct) << "getdir" << std::endl;
8189 tout(cct) << relpath << std::endl;
8190 }
8191
8192 dir_result_t *d;
8193 int r = opendir(relpath, &d, perms);
8194 if (r < 0)
8195 return r;
8196
8197 getdir_result gr;
8198 gr.contents = &contents;
8199 gr.num = 0;
8200 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8201
8202 closedir(d);
8203
8204 if (r < 0)
8205 return r;
8206 return gr.num;
8207 }
8208
8209
8210 /****** file i/o **********/
/****** file i/o **********/
// Full-featured open: resolves relpath (honoring O_NOFOLLOW / O_PATH /
// O_CREAT|O_EXCL symlink rules), optionally creates the file with the
// given mode and striping parameters, checks permissions when
// client_permissions is enabled, and on success returns a new integer
// file descriptor.  Negative return is an errno-style error.
int Client::open(const char *relpath, int flags, const UserPerm& perms,
		 mode_t mode, int stripe_unit, int stripe_count,
		 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  // exclusive create of an existing file fails outright
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  // O_NOFOLLOW on a symlink fails with ELOOP (unless O_PATH, where
  // opening the link itself is the point)
#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  // target missing and O_CREAT: walk to the parent and create there
  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
	goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
		stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
	goto out;
    }
  }

  // _create may have already produced an Fh; otherwise open the inode
  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
8295
// Convenience overload: open with default file striping (stripe unit,
// stripe count and object size all 0 select the filesystem defaults).
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
8301
8302 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8303 const UserPerm& perms)
8304 {
8305 Mutex::Locker lock(client_lock);
8306 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8307
8308 if (unmounting)
8309 return -ENOTCONN;
8310
8311 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8312 filepath path(ino);
8313 req->set_filepath(path);
8314
8315 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8316 char f[30];
8317 sprintf(f, "%u", h);
8318 filepath path2(dirino);
8319 path2.push_dentry(string(f));
8320 req->set_filepath2(path2);
8321
8322 int r = make_request(req, perms, NULL, NULL,
8323 rand() % mdsmap->get_num_in_mds());
8324 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8325 return r;
8326 }
8327
8328
8329 /**
8330 * Load inode into local cache.
8331 *
8332 * If inode pointer is non-NULL, and take a reference on
8333 * the resulting Inode object in one operation, so that caller
8334 * can safely assume inode will still be there after return.
8335 */
8336 int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8337 {
8338 ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;
8339
8340 if (unmounting)
8341 return -ENOTCONN;
8342
8343 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8344 filepath path(ino);
8345 req->set_filepath(path);
8346
8347 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8348 if (r == 0 && inode != NULL) {
8349 vinodeno_t vino(ino, CEPH_NOSNAP);
8350 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8351 assert(p != inode_map.end());
8352 *inode = p->second;
8353 _ll_get(*inode);
8354 }
8355 ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
8356 return r;
8357 }
8358
// Public, locking wrapper around _lookup_ino().
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
8364
8365 /**
8366 * Find the parent inode of `ino` and insert it into
8367 * our cache. Conditionally also set `parent` to a referenced
8368 * Inode* if caller provides non-NULL value.
8369 */
8370 int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8371 {
8372 ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8373
8374 if (unmounting)
8375 return -ENOTCONN;
8376
8377 if (!ino->dn_set.empty()) {
8378 // if we exposed the parent here, we'd need to check permissions,
8379 // but right now we just rely on the MDS doing so in make_request
8380 ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
8381 return 0;
8382 }
8383
8384 if (ino->is_root()) {
8385 *parent = NULL;
8386 ldout(cct, 8) << "ino is root, no parent" << dendl;
8387 return -EINVAL;
8388 }
8389
8390 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8391 filepath path(ino->ino);
8392 req->set_filepath(path);
8393
8394 InodeRef target;
8395 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8396 // Give caller a reference to the parent ino if they provided a pointer.
8397 if (parent != NULL) {
8398 if (r == 0) {
8399 *parent = target.get();
8400 _ll_get(*parent);
8401 ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
8402 } else {
8403 *parent = NULL;
8404 }
8405 }
8406 ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8407 return r;
8408 }
8409
// Public, locking wrapper around _lookup_parent().
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  return _lookup_parent(ino, perms, parent);
}
8415
8416 /**
8417 * Populate the parent dentry for `ino`, provided it is
8418 * a child of `parent`.
8419 */
// Populate the parent dentry for `ino`, provided it is a child of
// `parent` (LOOKUPNAME asks the MDS for the dentry linking the two).
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  // any active MDS can service the lookup
  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
8437
// Public, locking wrapper around _lookup_name().
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
8443
// Allocate and initialize a file handle for `in` opened with the given
// flags and cap-mode, including per-handle readahead configuration.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;

  // inode
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // snapshot inodes track open handles via snap_cap_refs instead of
    // normal cap open refs
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  // configure readahead from client config, capped by both an absolute
  // byte limit and a multiple of the file's layout period
  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // align readahead windows to the layout period and stripe unit
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
8480
// Tear down a file handle: drop delegations and open refs (flushing and
// re-checking caps if this was the last opener), release file locks,
// and surface any asynchronous write-back error to the caller.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // last open ref in this mode: flush dirty data and let the MDS know
    // our wanted caps may have shrunk
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // snapshot inodes track opens via snap_cap_refs (see _create_fh)
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
8515
8516 void Client::_put_fh(Fh *f)
8517 {
8518 int left = f->put();
8519 if (!left) {
8520 delete f;
8521 }
8522 }
8523
/*
 * Open an already-resolved inode.  Takes an open ref on the inode (which
 * affects the caps we report as wanted) and, unless the required caps are
 * already issued, sends CEPH_MDS_OP_OPEN to the MDS.  On success, *fhp (if
 * non-NULL) receives a freshly allocated Fh.  Returns 0 or negative errno.
 */
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // snapshots are immutable: refuse any write-ish open flags
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // already have what we need; just let the MDS know our wanted set changed
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    // need to ask the MDS; O_CREAT is stripped because the inode already exists
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
	in->put_open_ref(cmode);
      } else {
	// only needed the caps momentarily to serialize with delegations
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // undo the pending-open note taken above
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
8602
/*
 * Re-acquire caps for an inode with open handles (e.g. after an MDS session
 * event).  If usable caps are already present, just nudge check_caps();
 * otherwise replay an OPEN request with flags derived from the wanted caps.
 * Returns 0 or the result of the MDS request.
 */
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    // nothing to re-open; just refresh our wanted set with the MDS
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // map the wanted caps back onto open flags for the replayed OPEN
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
8640
8641 int Client::close(int fd)
8642 {
8643 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8644 Mutex::Locker lock(client_lock);
8645 tout(cct) << "close" << std::endl;
8646 tout(cct) << fd << std::endl;
8647
8648 if (unmounting)
8649 return -ENOTCONN;
8650
8651 Fh *fh = get_filehandle(fd);
8652 if (!fh)
8653 return -EBADF;
8654 int err = _release_fh(fh);
8655 fd_map.erase(fd);
8656 put_fd(fd);
8657 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8658 return err;
8659 }
8660
8661
8662 // ------------
8663 // read, write
8664
8665 loff_t Client::lseek(int fd, loff_t offset, int whence)
8666 {
8667 Mutex::Locker lock(client_lock);
8668 tout(cct) << "lseek" << std::endl;
8669 tout(cct) << fd << std::endl;
8670 tout(cct) << offset << std::endl;
8671 tout(cct) << whence << std::endl;
8672
8673 if (unmounting)
8674 return -ENOTCONN;
8675
8676 Fh *f = get_filehandle(fd);
8677 if (!f)
8678 return -EBADF;
8679 #if defined(__linux__) && defined(O_PATH)
8680 if (f->flags & O_PATH)
8681 return -EBADF;
8682 #endif
8683 return _lseek(f, offset, whence);
8684 }
8685
8686 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8687 {
8688 Inode *in = f->inode.get();
8689 int r;
8690
8691 switch (whence) {
8692 case SEEK_SET:
8693 f->pos = offset;
8694 break;
8695
8696 case SEEK_CUR:
8697 f->pos += offset;
8698 break;
8699
8700 case SEEK_END:
8701 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8702 if (r < 0)
8703 return r;
8704 f->pos = in->size + offset;
8705 break;
8706
8707 default:
8708 ceph_abort();
8709 }
8710
8711 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8712 return f->pos;
8713 }
8714
8715
/*
 * Acquire the logical file-position lock on an Fh.  Each waiter queues its
 * own Cond on f->pos_waiters and sleeps (on client_lock) until the position
 * is unlocked AND its Cond has reached the front of the queue, which keeps
 * acquisition FIFO-ordered.  Must be called with client_lock held; pairs
 * with unlock_fh_pos().
 */
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    // wait until unlocked and we are the oldest waiter
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
8733
8734 void Client::unlock_fh_pos(Fh *f)
8735 {
8736 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8737 f->pos_locked = false;
8738 }
8739
/*
 * Migrate inline file data out to the file's first RADOS object
 * ("uninline").  Queues two ops on the objecter: one creating the object,
 * then a guarded write (cmpxattr on "inline_version") that copies the
 * inline data into it.  onfinish is completed when the second op finishes,
 * or immediately if there is no inline data.  Always returns 0.
 */
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object id of the first stripe object: <ino>.00000000
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // first op: make sure the object exists (create(false) == non-exclusive)
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  // second op: write only if our inline_version is newer than the one
  // recorded on the object (guards against racing uninline attempts)
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8784
8785 //
8786
8787 // blocking osd interface
8788
8789 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8790 {
8791 Mutex::Locker lock(client_lock);
8792 tout(cct) << "read" << std::endl;
8793 tout(cct) << fd << std::endl;
8794 tout(cct) << size << std::endl;
8795 tout(cct) << offset << std::endl;
8796
8797 if (unmounting)
8798 return -ENOTCONN;
8799
8800 Fh *f = get_filehandle(fd);
8801 if (!f)
8802 return -EBADF;
8803 #if defined(__linux__) && defined(O_PATH)
8804 if (f->flags & O_PATH)
8805 return -EBADF;
8806 #endif
8807 bufferlist bl;
8808 int r = _read(f, offset, size, &bl);
8809 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8810 if (r >= 0) {
8811 bl.copy(0, bl.length(), buf);
8812 r = bl.length();
8813 }
8814 return r;
8815 }
8816
8817 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8818 {
8819 if (iovcnt < 0)
8820 return -EINVAL;
8821 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8822 }
8823
/*
 * Core read path.  A negative offset means "use and advance the handle's
 * file position" (guarded by lock_fh_pos/unlock_fh_pos).  Serves inline
 * data directly when possible, otherwise goes through the object cache
 * (_read_async) or synchronous OSD reads (_read_sync), retrying once after
 * a size re-check on a short read.  Returns bytes read or negative errno.
 */
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0: read at (and advance) the handle's current position
  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we have not yet fetched the inline data state
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      if (movepos)
	unlock_fh_pos(f);
      return r;
    }
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  }
  // O_DIRECT bypasses the cache even if we hold the CACHE cap
  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // cannot serve inline data without the CACHE cap; push it to RADOS
      // first and wait for completion below (after the done: label)
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // serve the read straight from the inline data buffer
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
      } else if ((uint64_t)offset < endoff) {
	// hole past the inline data but before EOF: zeros
	bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {

    // cached read path
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    // sync read path (no cache cap, O_DIRECT, or forced by conf)
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: drop caps, re-verify size with the MDS, and retry
      // if the file turns out to extend past what we read
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    // wait for the uninline op issued above to complete (drop client_lock
    // while blocking on its private flock/cond)
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0) {
    if (movepos)
      unlock_fh_pos(f);
    return r;
  } else
    return bl->length();
}
8967
// Completion for a speculative readahead: pins the Fh and records the
// in-flight readahead so the Readahead bookkeeping stays balanced
// (both are undone in the destructor).
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
8973
Client::C_Readahead::~C_Readahead() {
  // mirror the constructor: drop the pending-readahead count and the Fh ref
  f->readahead.dec_pending();
  client->_put_fh(f);
}
8978
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  // release the cap refs taken when the readahead was issued in _read_async()
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
8983
/*
 * Read through the object cache.  Trims the request to the known file
 * size, performs the (possibly blocking) cached read, then kicks off a
 * speculative readahead whose completion (C_Readahead) releases the extra
 * cap refs taken here.  Returns bytes read or negative errno.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r, rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Cond cond;
  bool done = false;
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    // cache miss: wait for the OSD read, dropping client_lock meanwhile
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached.
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// readahead in flight; caps are released by C_Readahead::finish
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9048
/*
 * Synchronous (uncached) read: issue Filer reads directly to the OSDs,
 * looping until the request is satisfied.  Holes below the known EOF are
 * zero-filled; a short read that may indicate EOF sets *checkeof so the
 * caller can re-verify the size and retry.  Returns bytes read or errno.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    // wait for the OSD read, dropping client_lock meanwhile
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->push_back(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      // may be at EOF; let the caller re-verify the size and retry
      *checkeof = true;
      return read;
    }
  }
  return read;
}
9115
9116
9117 /*
9118 * we keep count of uncommitted sync writes on the inode, so that
9119 * fsync can DDRT.
9120 */
void Client::_sync_write_commit(Inode *in)
{
  // one fewer uncommitted sync write in flight
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  // matches the get_cap_ref taken when the sync write was submitted
  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    // unmount may be blocked waiting for the last unsafe write to commit
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9134
9135 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
9136 {
9137 Mutex::Locker lock(client_lock);
9138 tout(cct) << "write" << std::endl;
9139 tout(cct) << fd << std::endl;
9140 tout(cct) << size << std::endl;
9141 tout(cct) << offset << std::endl;
9142
9143 if (unmounting)
9144 return -ENOTCONN;
9145
9146 Fh *fh = get_filehandle(fd);
9147 if (!fh)
9148 return -EBADF;
9149 #if defined(__linux__) && defined(O_PATH)
9150 if (fh->flags & O_PATH)
9151 return -EBADF;
9152 #endif
9153 int r = _write(fh, offset, size, buf, NULL, 0);
9154 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9155 return r;
9156 }
9157
9158 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9159 {
9160 if (iovcnt < 0)
9161 return -EINVAL;
9162 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9163 }
9164
/*
 * Common implementation for preadv/pwritev.  Sums the iovec lengths and
 * delegates to _write (write=true) or _read (write=false); on the read
 * side the returned bufferlist is scattered back into the caller's iovecs,
 * handling the case where fewer bytes were read than requested.
 * Returns bytes transferred or negative errno.
 */
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    // _write gathers directly from the iovecs
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // scatter the read data back into the caller's iovecs
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9214
/*
 * Core write path.  Data comes either from a flat buffer (buf) or from an
 * iovec array.  A negative offset means "use and advance the handle's file
 * position" (honoring O_APPEND).  Handles quota and pool-full checks,
 * setuid/setgid clearing, inline data, and both the buffered (object
 * cache) and synchronous write paths.  Returns bytes written or errno.
 */
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	           const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // remember the position to install after a successful write
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means inline state has not been fetched yet
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    // AUTH_SHARED was only needed to inspect in->mode above
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
	endoff > CEPH_INLINE_MAX_SIZE ||
	!(have & CEPH_CAP_FILE_BUFFER)) {
      // write would outgrow the inline limits (or we lack BUFFER caps):
      // push the inline data out to RADOS; completion is awaited at done:
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      // apply the write directly to the inline data buffer
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    // wait for the OSD write, dropping client_lock meanwhile
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    // wait for the uninline op issued above to complete
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9460
9461 int Client::_flush(Fh *f)
9462 {
9463 Inode *in = f->inode.get();
9464 int err = f->take_async_err();
9465 if (err != 0) {
9466 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9467 << cpp_strerror(err) << dendl;
9468 } else {
9469 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9470 }
9471
9472 return err;
9473 }
9474
9475 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9476 {
9477 struct ceph_statx stx;
9478 stx.stx_size = length;
9479 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9480 }
9481
9482 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9483 {
9484 Mutex::Locker lock(client_lock);
9485 tout(cct) << "ftruncate" << std::endl;
9486 tout(cct) << fd << std::endl;
9487 tout(cct) << length << std::endl;
9488
9489 if (unmounting)
9490 return -ENOTCONN;
9491
9492 Fh *f = get_filehandle(fd);
9493 if (!f)
9494 return -EBADF;
9495 #if defined(__linux__) && defined(O_PATH)
9496 if (f->flags & O_PATH)
9497 return -EBADF;
9498 #endif
9499 struct stat attr;
9500 attr.st_size = length;
9501 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9502 }
9503
9504 int Client::fsync(int fd, bool syncdataonly)
9505 {
9506 Mutex::Locker lock(client_lock);
9507 tout(cct) << "fsync" << std::endl;
9508 tout(cct) << fd << std::endl;
9509 tout(cct) << syncdataonly << std::endl;
9510
9511 if (unmounting)
9512 return -ENOTCONN;
9513
9514 Fh *f = get_filehandle(fd);
9515 if (!f)
9516 return -EBADF;
9517 #if defined(__linux__) && defined(O_PATH)
9518 if (f->flags & O_PATH)
9519 return -EBADF;
9520 #endif
9521 int r = _fsync(f, syncdataonly);
9522 if (r == 0) {
9523 // The IOs in this fsync were okay, but maybe something happened
9524 // in the background that we shoudl be reporting?
9525 r = f->take_async_err();
9526 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
9527 << ") = 0, async_err = " << r << dendl;
9528 } else {
9529 // Assume that an error we encountered during fsync, even reported
9530 // synchronously, would also have applied the error to the Fh, and we
9531 // should clear it here to avoid returning the same error again on next
9532 // call.
9533 ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
9534 << r << dendl;
9535 f->take_async_err();
9536 }
9537 return r;
9538 }
9539
/*
 * Flush an inode's dirty state to stable storage: dirty file data (via the
 * object cacher when enabled), dirty caps/metadata (unless syncdataonly),
 * and any still-unsafe MDS requests.  Blocks until everything relevant is
 * committed.  Returns 0 or negative errno from the data flush.
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // push dirty metadata caps to the MDS; remember the flush tid so we can
    // wait for its commit below
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    // ask the MDS to flush its log, then wait for our last unsafe request
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
9606
9607 int Client::_fsync(Fh *f, bool syncdataonly)
9608 {
9609 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9610 return _fsync(f->inode.get(), syncdataonly);
9611 }
9612
9613 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9614 {
9615 Mutex::Locker lock(client_lock);
9616 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9617 tout(cct) << fd << std::endl;
9618
9619 if (unmounting)
9620 return -ENOTCONN;
9621
9622 Fh *f = get_filehandle(fd);
9623 if (!f)
9624 return -EBADF;
9625 int r = _getattr(f->inode, mask, perms);
9626 if (r < 0)
9627 return r;
9628 fill_stat(f->inode, stbuf, NULL);
9629 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9630 return r;
9631 }
9632
9633 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9634 unsigned int want, unsigned int flags)
9635 {
9636 Mutex::Locker lock(client_lock);
9637 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9638 tout(cct) << fd << std::endl;
9639
9640 if (unmounting)
9641 return -ENOTCONN;
9642
9643 Fh *f = get_filehandle(fd);
9644 if (!f)
9645 return -EBADF;
9646
9647 unsigned mask = statx_to_mask(flags, want);
9648
9649 int r = 0;
9650 if (mask && !f->inode->caps_issued_mask(mask, true)) {
9651 r = _getattr(f->inode, mask, perms);
9652 if (r < 0) {
9653 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9654 return r;
9655 }
9656 }
9657
9658 fill_statx(f->inode, mask, stx);
9659 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9660 return r;
9661 }
9662
9663 // not written yet, but i want to link!
9664
9665 int Client::chdir(const char *relpath, std::string &new_cwd,
9666 const UserPerm& perms)
9667 {
9668 Mutex::Locker lock(client_lock);
9669 tout(cct) << "chdir" << std::endl;
9670 tout(cct) << relpath << std::endl;
9671
9672 if (unmounting)
9673 return -ENOTCONN;
9674
9675 filepath path(relpath);
9676 InodeRef in;
9677 int r = path_walk(path, &in, perms);
9678 if (r < 0)
9679 return r;
9680 if (cwd != in)
9681 cwd.swap(in);
9682 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9683
9684 _getcwd(new_cwd, perms);
9685 return 0;
9686 }
9687
// Build the absolute path of the current working directory by walking
// parent dentries from cwd up to the mount root.  If a link in the chain
// is missing from our cache, ask the MDS for it (LOOKUPNAME) and restart
// the walk from scratch.  On an unlinked cwd/ancestor, `dir` is left
// untouched.  Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      // (make_request may have dropped/retaken client_lock, so the
      // dentry chain we walked so far can no longer be trusted)
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9727
9728 void Client::getcwd(string& dir, const UserPerm& perms)
9729 {
9730 Mutex::Locker l(client_lock);
9731 if (!unmounting)
9732 _getcwd(dir, perms);
9733 }
9734
// statvfs(3)-style filesystem statistics.  Queries RADOS for pool usage
// (dropping client_lock across the objecter wait), then either reports
// quota-derived numbers (when a size quota covers the mount root) or the
// raw cluster totals.  `path` is unused; stats are global to the mount.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;
  unsigned long int total_files_on_fs;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  // Single data pool: ask for that pool's stats so quota-free df is
  // accurate; otherwise fall back to whole-cluster stats.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  // Drop the client lock while waiting on the OSD round trip.
  client_lock.Unlock();
  int rval = cond.wait();
  assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
  client_lock.Lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = 0;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // (stats.kb is in KB; shift by BLOCK_SHIFT-10 converts KB -> 4MB blocks)
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
9836
// Send a file-lock operation (fcntl- or flock-style) to the MDS and, on
// success, mirror the result into the local lock-state bookkeeping.
//   lock_type: CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
//   op:        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
//   sleep:     non-zero to block waiting for a conflicting lock
//   removing:  true when called from _release_filelocks(); skips updating
//              the per-Fh state (it is being torn down)
// Returns 0 or a negative errno (-EIO on an unrecognized l_type).
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Translate POSIX lock type into the ceph wire value.
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  // Only a blocking SETFILELOCK that actually acquires may sleep.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    // (req->get() takes an extra ref that the interrupt machinery holds;
    // balanced by put_request() below)
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or absent) lock reported by the MDS back
      // into the caller's struct flock.
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Mirror the successful change into the inode-wide state, lazily
      // allocating the tracking structure on first use.
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // Also track per file handle, so close/release can drop the locks —
      // unless we are the ones releasing them right now.
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
9947
// Interrupt a blocked file-lock request (`req`).  Marks the request
// aborted with -EINTR and, if it already reached an MDS, sends a matching
// *_INTR unlock so the MDS stops waiting on our behalf.
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Choose the interrupt rule matching the original lock flavor.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  // Clone the original lock arguments, then rewrite rule/type so the MDS
  // treats it as "cancel that pending lock".
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Reuse the credentials of the request being interrupted.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
9980
9981 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9982 {
9983 if (!in->fcntl_locks && !in->flock_locks)
9984 return;
9985
9986 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9987 ::encode(nr_fcntl_locks, bl);
9988 if (nr_fcntl_locks) {
9989 ceph_lock_state_t* lock_state = in->fcntl_locks;
9990 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9991 p != lock_state->held_locks.end();
9992 ++p)
9993 ::encode(p->second, bl);
9994 }
9995
9996 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9997 ::encode(nr_flock_locks, bl);
9998 if (nr_flock_locks) {
9999 ceph_lock_state_t* lock_state = in->flock_locks;
10000 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10001 p != lock_state->held_locks.end();
10002 ++p)
10003 ::encode(p->second, bl);
10004 }
10005
10006 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
10007 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
10008 }
10009
10010 void Client::_release_filelocks(Fh *fh)
10011 {
10012 if (!fh->fcntl_locks && !fh->flock_locks)
10013 return;
10014
10015 Inode *in = fh->inode.get();
10016 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
10017
10018 list<pair<int, ceph_filelock> > to_release;
10019
10020 if (fh->fcntl_locks) {
10021 ceph_lock_state_t* lock_state = fh->fcntl_locks;
10022 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10023 p != lock_state->held_locks.end();
10024 ++p)
10025 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
10026 delete fh->fcntl_locks;
10027 }
10028 if (fh->flock_locks) {
10029 ceph_lock_state_t* lock_state = fh->flock_locks;
10030 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10031 p != lock_state->held_locks.end();
10032 ++p)
10033 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
10034 delete fh->flock_locks;
10035 }
10036
10037 if (to_release.empty())
10038 return;
10039
10040 struct flock fl;
10041 memset(&fl, 0, sizeof(fl));
10042 fl.l_whence = SEEK_SET;
10043 fl.l_type = F_UNLCK;
10044
10045 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10046 p != to_release.end();
10047 ++p) {
10048 fl.l_start = p->second.start;
10049 fl.l_len = p->second.length;
10050 fl.l_pid = p->second.pid;
10051 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10052 p->second.owner, true);
10053 }
10054 }
10055
10056 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10057 ceph_lock_state_t *lock_state)
10058 {
10059 int lock_cmd;
10060 if (F_RDLCK == fl->l_type)
10061 lock_cmd = CEPH_LOCK_SHARED;
10062 else if (F_WRLCK == fl->l_type)
10063 lock_cmd = CEPH_LOCK_EXCL;
10064 else
10065 lock_cmd = CEPH_LOCK_UNLOCK;;
10066
10067 ceph_filelock filelock;
10068 filelock.start = fl->l_start;
10069 filelock.length = fl->l_len;
10070 filelock.client = 0;
10071 // see comment in _do_filelock()
10072 filelock.owner = owner | (1ULL << 63);
10073 filelock.pid = fl->l_pid;
10074 filelock.type = lock_cmd;
10075
10076 if (filelock.type == CEPH_LOCK_UNLOCK) {
10077 list<ceph_filelock> activated_locks;
10078 lock_state->remove_lock(filelock, activated_locks);
10079 } else {
10080 bool r = lock_state->add_lock(filelock, false, false, NULL);
10081 assert(r);
10082 }
10083 }
10084
10085 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10086 {
10087 Inode *in = fh->inode.get();
10088 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10089 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10090 return ret;
10091 }
10092
10093 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10094 {
10095 Inode *in = fh->inode.get();
10096 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10097 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10098 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10099 return ret;
10100 }
10101
10102 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10103 {
10104 Inode *in = fh->inode.get();
10105 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10106
10107 int sleep = !(cmd & LOCK_NB);
10108 cmd &= ~LOCK_NB;
10109
10110 int type;
10111 switch (cmd) {
10112 case LOCK_SH:
10113 type = F_RDLCK;
10114 break;
10115 case LOCK_EX:
10116 type = F_WRLCK;
10117 break;
10118 case LOCK_UN:
10119 type = F_UNLCK;
10120 break;
10121 default:
10122 return -EINVAL;
10123 }
10124
10125 struct flock fl;
10126 memset(&fl, 0, sizeof(fl));
10127 fl.l_type = type;
10128 fl.l_whence = SEEK_SET;
10129
10130 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10131 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10132 return ret;
10133 }
10134
10135 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10136 {
10137 /* Since the only thing this does is wrap a call to statfs, and
10138 statfs takes a lock, it doesn't seem we have a need to split it
10139 out. */
10140 return statfs(0, stbuf, perms);
10141 }
10142
// Register the callbacks supplied by the libcephfs/FUSE layer (cache
// invalidation, lock interruption, remount, umask).  For each callback that
// is present, the matching finisher thread is started so the callback can
// be invoked asynchronously.  Safe to call with a null args (no-op).
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  // Opaque pointer handed back to every callback invocation.
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  // umask_cb needs no finisher; it is stored unconditionally (may be null).
  umask_cb = args->umask_cb;
}
10173
// Verify that we have some way to invalidate the kernel's dentry cache:
// either a registered dentry-invalidate callback, or a remount callback
// (which is exercised once via _do_remount).  If neither works and the
// config says so, abort; otherwise just log loudly.  Returns 0 on success
// or the remount test's error.
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }
  // NOTE: if neither branch above applies, r stays 0 and the failure is
  // silently tolerated; the abort below only triggers on a remount error.
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
10199
// Flush everything to stable storage: dirty file data via the object
// cacher, dirty caps to the MDS, and all unsafe (unacked) MDS requests.
// Caller must hold client_lock; it is dropped while waiting for the data
// flush to complete.  Always returns 0.
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;  // no object cacher => nothing buffered to flush

  // flush caps
  flush_caps_sync();
  // snapshot the flush tid now so we wait only for caps dirtied up to
  // this point, not for later activity.
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    // Drop client_lock while blocking on the flush completion; C_SafeCond
    // signals `cond` under the local `lock`.
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}
10234
10235 int Client::sync_fs()
10236 {
10237 Mutex::Locker l(client_lock);
10238
10239 if (unmounting)
10240 return -ENOTCONN;
10241
10242 return _sync_fs();
10243 }
10244
// Drop all releasable data from the object cacher; the return value comes
// straight from ObjectCacher::release_all() (presumably the amount that
// could NOT be released or was released — confirm against osdc/ObjectCacher).
// NOTE(review): unlike the other public entry points here, this does not
// check `unmounting` — verify that is intentional.
int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
10250
10251
10252 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10253 {
10254 Mutex::Locker l(client_lock);
10255 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10256 << ", " << offset << ", " << count << ")" << dendl;
10257
10258 Fh *f = get_filehandle(fd);
10259 if (!f)
10260 return -EBADF;
10261
10262 // for now
10263 _fsync(f, true);
10264
10265 return 0;
10266 }
10267
10268 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10269 {
10270 Mutex::Locker l(client_lock);
10271 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10272 << ", " << offset << ", " << count << ")" << dendl;
10273
10274 Fh *f = get_filehandle(fd);
10275 if (!f)
10276 return -EBADF;
10277 Inode *in = f->inode.get();
10278
10279 _fsync(f, true);
10280 if (_release(in))
10281 check_caps(in, 0);
10282 return 0;
10283 }
10284
10285
10286 // =============================
10287 // snaps
10288
10289 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10290 {
10291 Mutex::Locker l(client_lock);
10292
10293 if (unmounting)
10294 return -ENOTCONN;
10295
10296 filepath path(relpath);
10297 InodeRef in;
10298 int r = path_walk(path, &in, perm);
10299 if (r < 0)
10300 return r;
10301 if (cct->_conf->client_permissions) {
10302 r = may_create(in.get(), perm);
10303 if (r < 0)
10304 return r;
10305 }
10306 Inode *snapdir = open_snapdir(in.get());
10307 return _mkdir(snapdir, name, 0, perm);
10308 }
10309
10310 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10311 {
10312 Mutex::Locker l(client_lock);
10313
10314 if (unmounting)
10315 return -ENOTCONN;
10316
10317 filepath path(relpath);
10318 InodeRef in;
10319 int r = path_walk(path, &in, perms);
10320 if (r < 0)
10321 return r;
10322 if (cct->_conf->client_permissions) {
10323 r = may_delete(in.get(), NULL, perms);
10324 if (r < 0)
10325 return r;
10326 }
10327 Inode *snapdir = open_snapdir(in.get());
10328 return _rmdir(snapdir, name, perms);
10329 }
10330
10331 // =============================
10332 // expose caps
10333
10334 int Client::get_caps_issued(int fd) {
10335
10336 Mutex::Locker lock(client_lock);
10337
10338 if (unmounting)
10339 return -ENOTCONN;
10340
10341 Fh *f = get_filehandle(fd);
10342 if (!f)
10343 return -EBADF;
10344
10345 return f->inode->caps_issued();
10346 }
10347
10348 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10349 {
10350 Mutex::Locker lock(client_lock);
10351
10352 if (unmounting)
10353 return -ENOTCONN;
10354
10355 filepath p(path);
10356 InodeRef in;
10357 int r = path_walk(p, &in, perms, true);
10358 if (r < 0)
10359 return r;
10360 return in->caps_issued();
10361 }
10362
10363 // =========================================
10364 // low level
10365
// Return (creating on first use) the virtual ".snap" directory inode for
// `diri`.  The snapdir shares the directory's ino but uses snapid
// CEPH_SNAPDIR, and mirrors most of the directory's attributes.  The new
// inode is inserted into inode_map so later lookups find the same object.
Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    // Clone the visible attributes of the real directory.
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    // Link back to the real directory and mark it so it knows a snapdir
    // alias exists.
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
10397
// Low-level lookup of `name` under `parent`.  On success *out gets the
// child inode with an ll_ref taken (caller must balance with ll_forget/
// ll_put) and *attr is filled; on failure attr->st_ino is zeroed and *out
// is NULL (the InodeRef is empty).
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // FUSE can delegate permission checks to us; honor exec perm on parent.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  // Pin the inode for the low-level API caller.
  _ll_get(in.get());

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
	  << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
10437
// Resolve an inode by number and make sure we also know its parent dentry
// (so the inode is properly linked into the cache).  On success *inode
// holds a reference the caller must eventually forget.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r) {
    return r;
  }
  assert(inode != NULL);
  assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error
    // (drop the reference taken in Num1 before bailing out)
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  // The parent reference was only needed for the name lookup; release it.
  _ll_forget(parent, 1);
  return 0;
}
10482
// statx flavor of ll_lookup(): look up `name` under `parent`, fill *stx
// with the fields selected by want/flags, and on success hand back the
// child in *out with an ll_ref taken.  On failure stx ino/mask are zeroed
// and *out is NULL.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  // FUSE can delegate permission checks to us; honor exec perm on parent.
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
	  << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
10523
// Walk a full path from the mount root and return the resulting inode
// (ll_ref taken) plus its statx.  AT_SYMLINK_NOFOLLOW in `flags` disables
// following a trailing symlink.  On error *out is NULL and stx is zeroed.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk" << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    // Pin the inode for the low-level API caller.
    _ll_get(in.get());
    *out = in.get();
    return 0;
  }
}
10556
// Take one low-level (FUSE-visible) reference on `in`.  The first ll_ref
// also takes a regular inode ref and pins the directory's parent dentry so
// the path to the inode stays cached while the kernel holds it.
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
10569
// Drop `num` low-level references from `in`.  When the count reaches zero
// the dentry pin and the regular inode ref taken in _ll_get() are released.
// Returns the remaining ll_ref count (0 means fully released).
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
10585
// Drop every outstanding low-level reference on every cached inode (used
// during teardown).  Inodes are also inserted into a local InodeRef set so
// none is destroyed while we are still iterating; the set's destruction at
// function exit performs the final puts.
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  // _ll_put() can erase entries from inode_map, so capture the successor
  // iterator before touching the current element.
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
10603
// FUSE "forget": drop `count` low-level references from `in`.  Returns
// true when the inode's ll_ref reached zero (or the forget was ignored).
// Forgets on the root inode and forgets after unmount are ignored.
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // The kernel thinks it holds more refs than we do; clamp to what we
    // have rather than underflowing the counter.
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
10632
10633 bool Client::ll_forget(Inode *in, int count)
10634 {
10635 Mutex::Locker lock(client_lock);
10636 return _ll_forget(in, count);
10637 }
10638
10639 bool Client::ll_put(Inode *in)
10640 {
10641 /* ll_forget already takes the lock */
10642 return ll_forget(in, 1);
10643 }
10644
10645 snapid_t Client::ll_get_snapid(Inode *in)
10646 {
10647 Mutex::Locker lock(client_lock);
10648 return in->snapid;
10649 }
10650
10651 Inode *Client::ll_get_inode(ino_t ino)
10652 {
10653 Mutex::Locker lock(client_lock);
10654
10655 if (unmounting)
10656 return NULL;
10657
10658 vinodeno_t vino = _map_faked_ino(ino);
10659 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10660 if (p == inode_map.end())
10661 return NULL;
10662 Inode *in = p->second;
10663 _ll_get(in);
10664 return in;
10665 }
10666
10667 Inode *Client::ll_get_inode(vinodeno_t vino)
10668 {
10669 Mutex::Locker lock(client_lock);
10670
10671 if (unmounting)
10672 return NULL;
10673
10674 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10675 if (p == inode_map.end())
10676 return NULL;
10677 Inode *in = p->second;
10678 _ll_get(in);
10679 return in;
10680 }
10681
10682 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10683 {
10684 vinodeno_t vino = _get_vino(in);
10685
10686 ldout(cct, 8) << "ll_getattr " << vino << dendl;
10687 tout(cct) << "ll_getattr" << std::endl;
10688 tout(cct) << vino.ino.val << std::endl;
10689
10690 if (vino.snapid < CEPH_NOSNAP)
10691 return 0;
10692 else
10693 return _getattr(in, caps, perms);
10694 }
10695
10696 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10697 {
10698 Mutex::Locker lock(client_lock);
10699
10700 if (unmounting)
10701 return -ENOTCONN;
10702
10703 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10704
10705 if (res == 0)
10706 fill_stat(in, attr);
10707 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10708 return res;
10709 }
10710
10711 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10712 unsigned int flags, const UserPerm& perms)
10713 {
10714 Mutex::Locker lock(client_lock);
10715
10716 if (unmounting)
10717 return -ENOTCONN;
10718
10719 int res = 0;
10720 unsigned mask = statx_to_mask(flags, want);
10721
10722 if (mask && !in->caps_issued_mask(mask, true))
10723 res = _ll_getattr(in, mask, perms);
10724
10725 if (res == 0)
10726 fill_statx(in, mask, stx);
10727 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10728 return res;
10729 }
10730
// Core of ll_setattr/ll_setattrx: log the request, optionally enforce
// permissions client-side, then apply the attribute change via __setattrx.
// *inp receives the (possibly updated) inode reference on success.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // When FUSE's default_permissions is in effect the kernel has already
  // checked access; otherwise we must do it ourselves.
  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // The *_NOW bits were only meaningful to may_setattr (utimes-style
  // "set to current time"); strip them before handing off.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
10759
10760 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10761 const UserPerm& perms)
10762 {
10763 Mutex::Locker lock(client_lock);
10764
10765 if (unmounting)
10766 return -ENOTCONN;
10767
10768 InodeRef target(in);
10769 int res = _ll_setattrx(in, stx, mask, perms, &target);
10770 if (res == 0) {
10771 assert(in == target.get());
10772 fill_statx(in, in->caps_issued(), stx);
10773 }
10774
10775 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10776 return res;
10777 }
10778
10779 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10780 const UserPerm& perms)
10781 {
10782 struct ceph_statx stx;
10783 stat_to_statx(attr, &stx);
10784
10785 Mutex::Locker lock(client_lock);
10786
10787 if (unmounting)
10788 return -ENOTCONN;
10789
10790 InodeRef target(in);
10791 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10792 if (res == 0) {
10793 assert(in == target.get());
10794 fill_stat(in, attr);
10795 }
10796
10797 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10798 return res;
10799 }
10800
10801
10802 // ----------
10803 // xattrs
10804
10805 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10806 const UserPerm& perms)
10807 {
10808 Mutex::Locker lock(client_lock);
10809
10810 if (unmounting)
10811 return -ENOTCONN;
10812
10813 InodeRef in;
10814 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10815 if (r < 0)
10816 return r;
10817 return _getxattr(in, name, value, size, perms);
10818 }
10819
10820 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10821 const UserPerm& perms)
10822 {
10823 Mutex::Locker lock(client_lock);
10824
10825 if (unmounting)
10826 return -ENOTCONN;
10827
10828 InodeRef in;
10829 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10830 if (r < 0)
10831 return r;
10832 return _getxattr(in, name, value, size, perms);
10833 }
10834
10835 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10836 const UserPerm& perms)
10837 {
10838 Mutex::Locker lock(client_lock);
10839
10840 if (unmounting)
10841 return -ENOTCONN;
10842
10843 Fh *f = get_filehandle(fd);
10844 if (!f)
10845 return -EBADF;
10846 return _getxattr(f->inode, name, value, size, perms);
10847 }
10848
10849 int Client::listxattr(const char *path, char *list, size_t size,
10850 const UserPerm& perms)
10851 {
10852 Mutex::Locker lock(client_lock);
10853
10854 if (unmounting)
10855 return -ENOTCONN;
10856
10857 InodeRef in;
10858 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10859 if (r < 0)
10860 return r;
10861 return Client::_listxattr(in.get(), list, size, perms);
10862 }
10863
10864 int Client::llistxattr(const char *path, char *list, size_t size,
10865 const UserPerm& perms)
10866 {
10867 Mutex::Locker lock(client_lock);
10868
10869 if (unmounting)
10870 return -ENOTCONN;
10871
10872 InodeRef in;
10873 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10874 if (r < 0)
10875 return r;
10876 return Client::_listxattr(in.get(), list, size, perms);
10877 }
10878
10879 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10880 {
10881 Mutex::Locker lock(client_lock);
10882
10883 if (unmounting)
10884 return -ENOTCONN;
10885
10886 Fh *f = get_filehandle(fd);
10887 if (!f)
10888 return -EBADF;
10889 return Client::_listxattr(f->inode.get(), list, size, perms);
10890 }
10891
10892 int Client::removexattr(const char *path, const char *name,
10893 const UserPerm& perms)
10894 {
10895 Mutex::Locker lock(client_lock);
10896
10897 if (unmounting)
10898 return -ENOTCONN;
10899
10900 InodeRef in;
10901 int r = Client::path_walk(path, &in, perms, true);
10902 if (r < 0)
10903 return r;
10904 return _removexattr(in, name, perms);
10905 }
10906
10907 int Client::lremovexattr(const char *path, const char *name,
10908 const UserPerm& perms)
10909 {
10910 Mutex::Locker lock(client_lock);
10911
10912 if (unmounting)
10913 return -ENOTCONN;
10914
10915 InodeRef in;
10916 int r = Client::path_walk(path, &in, perms, false);
10917 if (r < 0)
10918 return r;
10919 return _removexattr(in, name, perms);
10920 }
10921
10922 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10923 {
10924 Mutex::Locker lock(client_lock);
10925
10926 if (unmounting)
10927 return -ENOTCONN;
10928
10929 Fh *f = get_filehandle(fd);
10930 if (!f)
10931 return -EBADF;
10932 return _removexattr(f->inode, name, perms);
10933 }
10934
10935 int Client::setxattr(const char *path, const char *name, const void *value,
10936 size_t size, int flags, const UserPerm& perms)
10937 {
10938 _setxattr_maybe_wait_for_osdmap(name, value, size);
10939
10940 Mutex::Locker lock(client_lock);
10941
10942 if (unmounting)
10943 return -ENOTCONN;
10944
10945 InodeRef in;
10946 int r = Client::path_walk(path, &in, perms, true);
10947 if (r < 0)
10948 return r;
10949 return _setxattr(in, name, value, size, flags, perms);
10950 }
10951
10952 int Client::lsetxattr(const char *path, const char *name, const void *value,
10953 size_t size, int flags, const UserPerm& perms)
10954 {
10955 _setxattr_maybe_wait_for_osdmap(name, value, size);
10956
10957 Mutex::Locker lock(client_lock);
10958
10959 if (unmounting)
10960 return -ENOTCONN;
10961
10962 InodeRef in;
10963 int r = Client::path_walk(path, &in, perms, false);
10964 if (r < 0)
10965 return r;
10966 return _setxattr(in, name, value, size, flags, perms);
10967 }
10968
10969 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10970 int flags, const UserPerm& perms)
10971 {
10972 _setxattr_maybe_wait_for_osdmap(name, value, size);
10973
10974 Mutex::Locker lock(client_lock);
10975
10976 if (unmounting)
10977 return -ENOTCONN;
10978
10979 Fh *f = get_filehandle(fd);
10980 if (!f)
10981 return -EBADF;
10982 return _setxattr(f->inode, name, value, size, flags, perms);
10983 }
10984
10985 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10986 const UserPerm& perms)
10987 {
10988 int r;
10989
10990 const VXattr *vxattr = _match_vxattr(in, name);
10991 if (vxattr) {
10992 r = -ENODATA;
10993
10994 // Do a force getattr to get the latest quota before returning
10995 // a value to userspace.
10996 int flags = 0;
10997 if (vxattr->flags & VXATTR_RSTAT) {
10998 flags |= CEPH_STAT_RSTAT;
10999 }
11000 r = _getattr(in, flags, perms, true);
11001 if (r != 0) {
11002 // Error from getattr!
11003 return r;
11004 }
11005
11006 // call pointer-to-member function
11007 char buf[256];
11008 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11009 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11010 } else {
11011 r = -ENODATA;
11012 }
11013
11014 if (size != 0) {
11015 if (r > (int)size) {
11016 r = -ERANGE;
11017 } else if (r > 0) {
11018 memcpy(value, buf, r);
11019 }
11020 }
11021 goto out;
11022 }
11023
11024 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11025 r = -EOPNOTSUPP;
11026 goto out;
11027 }
11028
11029 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11030 if (r == 0) {
11031 string n(name);
11032 r = -ENODATA;
11033 if (in->xattrs.count(n)) {
11034 r = in->xattrs[n].length();
11035 if (r > 0 && size != 0) {
11036 if (size >= (unsigned)r)
11037 memcpy(value, in->xattrs[n].c_str(), r);
11038 else
11039 r = -ERANGE;
11040 }
11041 }
11042 }
11043 out:
11044 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
11045 return r;
11046 }
11047
11048 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11049 const UserPerm& perms)
11050 {
11051 if (cct->_conf->client_permissions) {
11052 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11053 if (r < 0)
11054 return r;
11055 }
11056 return _getxattr(in.get(), name, value, size, perms);
11057 }
11058
11059 int Client::ll_getxattr(Inode *in, const char *name, void *value,
11060 size_t size, const UserPerm& perms)
11061 {
11062 Mutex::Locker lock(client_lock);
11063
11064 if (unmounting)
11065 return -ENOTCONN;
11066
11067 vinodeno_t vino = _get_vino(in);
11068
11069 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
11070 tout(cct) << "ll_getxattr" << std::endl;
11071 tout(cct) << vino.ino.val << std::endl;
11072 tout(cct) << name << std::endl;
11073
11074 if (!cct->_conf->fuse_default_permissions) {
11075 int r = xattr_permission(in, name, MAY_READ, perms);
11076 if (r < 0)
11077 return r;
11078 }
11079
11080 return _getxattr(in, name, value, size, perms);
11081 }
11082
/*
 * List xattr names into the caller's buffer as a sequence of
 * NUL-terminated strings.  Returns the total byte count needed (size == 0
 * is the "probe" convention), -ERANGE if the buffer is too small, or a
 * _getattr error.  Virtual "ceph.*" xattrs are appended after the real
 * ones; hidden vxattrs and those whose exists_cb reports absence are
 * skipped when filling the buffer.
 * NOTE(review): the size computed via _vxattrs_name_size() is assumed to
 * exclude hidden entries so it matches what the fill loop writes —
 * confirm against its definition elsewhere in the file.
 */
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  // Only force a fetch if we have never seen the xattrs for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    // First pass: r accumulates the required buffer size (r starts at 0).
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	// Second pass: copy each name followed by its NUL terminator.
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11127
11128 int Client::ll_listxattr(Inode *in, char *names, size_t size,
11129 const UserPerm& perms)
11130 {
11131 Mutex::Locker lock(client_lock);
11132
11133 if (unmounting)
11134 return -ENOTCONN;
11135
11136 vinodeno_t vino = _get_vino(in);
11137
11138 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
11139 tout(cct) << "ll_listxattr" << std::endl;
11140 tout(cct) << vino.ino.val << std::endl;
11141 tout(cct) << size << std::endl;
11142
11143 return _listxattr(in, names, size, perms);
11144 }
11145
/*
 * Issue the CEPH_MDS_OP_SETXATTR request to the MDS.  A NULL value means
 * "remove" (CEPH_XATTR_REMOVE); XATTR_CREATE / XATTR_REPLACE are mapped
 * to their wire-protocol counterparts.  Callers are expected to pass
 * size == 0 when value is NULL.
 */
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);           // xattr name travels as string2
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The value itself is carried in the request's data payload.
  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
11177
/*
 * Validate and set an xattr on an inode.
 *
 * Rejects snapshots (-EROFS) and any namespace the kernel client would
 * not accept (-EOPNOTSUPP).  "system.*" names are only honored when POSIX
 * ACLs are enabled, in which case ACL payloads get special handling:
 * an access ACL that is equivalent to a plain mode is translated into a
 * chmod-style setattr and the xattr itself is dropped (value = NULL).
 * Read-only virtual xattrs cannot be set.
 */
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// posix_acl_equiv_mode returns 0 when the ACL is fully expressible
	// as a plain mode: apply the mode and store no xattr at all.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // Only stx_mode is read because the mask is CEPH_SETATTR_MODE.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    // "ceph.*" virtual xattrs that are read-only cannot be set.
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}
11238
11239 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11240 size_t size, int flags, const UserPerm& perms)
11241 {
11242 if (cct->_conf->client_permissions) {
11243 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11244 if (r < 0)
11245 return r;
11246 }
11247 return _setxattr(in.get(), name, value, size, flags, perms);
11248 }
11249
/*
 * Check whether the pool referenced by a "layout" / "layout.pool" xattr
 * value exists in the given osdmap.  For the composite "layout" form the
 * value is parsed as key=value pairs (Boost Spirit.Qi grammar in
 * keys_and_values) and the "pool" key, if any, is extracted.  The pool
 * may be given numerically or by name.
 * Returns 0 if no pool was named or the pool exists, -EINVAL on parse
 * failure, -ENOENT if the pool is unknown to this osdmap.
 */
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    // a trailing unparsed remainder also means the value was malformed
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      // Numeric form first; fall back to name lookup if it doesn't parse.
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
11289
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // strip the "ceph.file."/"ceph.dir." prefix: check helper expects
    // "layout" or "layout.pool"
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Pool unknown to our current map: block until we have the latest
    // osdmap before issuing the request (best effort; the MDS re-checks).
    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
11310
11311 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11312 size_t size, int flags, const UserPerm& perms)
11313 {
11314 _setxattr_maybe_wait_for_osdmap(name, value, size);
11315
11316 Mutex::Locker lock(client_lock);
11317
11318 if (unmounting)
11319 return -ENOTCONN;
11320
11321 vinodeno_t vino = _get_vino(in);
11322
11323 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11324 tout(cct) << "ll_setxattr" << std::endl;
11325 tout(cct) << vino.ino.val << std::endl;
11326 tout(cct) << name << std::endl;
11327
11328 if (!cct->_conf->fuse_default_permissions) {
11329 int r = xattr_permission(in, name, MAY_WRITE, perms);
11330 if (r < 0)
11331 return r;
11332 }
11333 return _setxattr(in, name, value, size, flags, perms);
11334 }
11335
/*
 * Remove an xattr via CEPH_MDS_OP_RMXATTR.  Rejects snapshots (-EROFS),
 * namespaces the kernel client would not accept, and read-only virtual
 * xattrs (-EOPNOTSUPP).
 */
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);     // xattr name travels as the second filepath
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
11367
11368 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11369 {
11370 if (cct->_conf->client_permissions) {
11371 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11372 if (r < 0)
11373 return r;
11374 }
11375 return _removexattr(in.get(), name, perms);
11376 }
11377
11378 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11379 {
11380 Mutex::Locker lock(client_lock);
11381
11382 if (unmounting)
11383 return -ENOTCONN;
11384
11385 vinodeno_t vino = _get_vino(in);
11386
11387 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11388 tout(cct) << "ll_removexattr" << std::endl;
11389 tout(cct) << vino.ino.val << std::endl;
11390 tout(cct) << name << std::endl;
11391
11392 if (!cct->_conf->fuse_default_permissions) {
11393 int r = xattr_permission(in, name, MAY_WRITE, perms);
11394 if (r < 0)
11395 return r;
11396 }
11397
11398 return _removexattr(in, name, perms);
11399 }
11400
// exists_cb for "ceph.quota.*": the vxattrs are visible only when a quota
// is configured on the inode.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable();
}
// "ceph.quota": composite view of both quota limits.
// quota.max_bytes/max_files are signed 64-bit, so %lld with a long long
// cast is the correct pairing here.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// "ceph.quota.max_bytes"
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// "ceph.quota.max_files"
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11420
// exists_cb for "ceph.{file,dir}.layout*": visible only when the inode's
// layout differs from the default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// Composite "ceph.{file,dir}.layout" value.  The pool is rendered by name
// when it exists in the current osdmap, otherwise by numeric id.
// NOTE(review): snprintf returns the length that WOULD have been written;
// if the first snprintf truncates (r > size), "size - r" underflows as
// size_t and "val + r" points past the buffer — relies on callers sizing
// the buffer generously (see the 256-byte buf in _getxattr). Confirm.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
11445 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11446 {
11447 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11448 }
11449 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11450 {
11451 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11452 }
11453 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11454 {
11455 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11456 }
// "ceph.*.layout.pool": pool name when the current osdmap knows it,
// numeric id otherwise.  r is always assigned — the with_osdmap lambda
// runs synchronously on every path.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// "ceph.*.layout.pool_namespace"
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
11473 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11474 {
11475 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11476 }
11477 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11478 {
11479 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11480 }
11481 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11482 {
11483 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11484 }
11485 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11486 {
11487 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11488 }
11489 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11490 {
11491 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11492 }
11493 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11494 {
11495 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11496 }
11497 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11498 {
11499 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11500 }
11501 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11502 {
11503 return snprintf(val, size, "%ld.09%ld", (long)in->rstat.rctime.sec(),
11504 (long)in->rstat.rctime.nsec());
11505 }
11506
// Helpers for building the vxattr tables below.  They use GNU-style
// designated initializers ("field: value") matching Client::VXattr's
// member order.  Comments are placed above each macro (a // inside a
// line-continuation macro would swallow the trailing backslash).

// "ceph.<type>.<name>" and "ceph.<type>.<name>.<name2>" name builders.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only, always-present, visible vxattr backed by
// Client::_vxattrcb_<type>_<name>.
#define XATTR_NAME_CEPH(_type, _name)			\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true,					\
  hidden: false,					\
  exists_cb: NULL,					\
  flags: 0,						\
}
// Same, but with explicit flags (e.g. VXATTR_RSTAT to force an rstat
// refresh before reading).
#define XATTR_NAME_CEPH2(_type, _name, _flags)		\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: true,					\
  hidden: false,					\
  exists_cb: NULL,					\
  flags: _flags,					\
}
// Writable, hidden layout sub-field; only "exists" when the inode has a
// non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)	\
{							\
  name: CEPH_XATTR_NAME2(_type, _name, _field),		\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
  readonly: false,					\
  hidden: true,						\
  exists_cb: &Client::_vxattrcb_layout_exists,		\
  flags: 0,						\
}
// Writable, hidden quota sub-field; only "exists" when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)			\
{							\
  name: CEPH_XATTR_NAME(_type, _name),			\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
  readonly: false,					\
  hidden: true,						\
  exists_cb: &Client::_vxattrcb_quota_exists,		\
  flags: 0,						\
}
11546
// Virtual xattrs exposed on directories.  Terminated by an empty-name
// sentinel; _match_vxattr and _vxattrs_calcu_name_size iterate until it.
const Client::VXattr Client::_dir_vxattrs[] = {
  // composite layout value; hidden from listxattr, sub-fields below
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // recursive stats need a fresh rstat from the MDS (VXATTR_RSTAT)
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  // composite quota value; hidden, sub-fields below
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
11581
// Virtual xattrs exposed on regular files: layout only.  Terminated by an
// empty-name sentinel.
const Client::VXattr Client::_file_vxattrs[] = {
  // composite layout value; hidden from listxattr, sub-fields below
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
11598
11599 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11600 {
11601 if (in->is_dir())
11602 return _dir_vxattrs;
11603 else if (in->is_file())
11604 return _file_vxattrs;
11605 return NULL;
11606 }
11607
11608 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11609 {
11610 if (strncmp(name, "ceph.", 5) == 0) {
11611 const VXattr *vxattr = _get_vxattrs(in);
11612 if (vxattr) {
11613 while (!vxattr->name.empty()) {
11614 if (vxattr->name == name)
11615 return vxattr;
11616 vxattr++;
11617 }
11618 }
11619 }
11620 return NULL;
11621 }
11622
11623 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11624 {
11625 size_t len = 0;
11626 while (!vxattr->name.empty()) {
11627 if (!vxattr->hidden)
11628 len += vxattr->name.length() + 1;
11629 vxattr++;
11630 }
11631 return len;
11632 }
11633
// Read the target of a symlink inode into buf (up to buflen bytes).
// Touches all dentries referencing the inode to keep them warm in the
// LRU before reading the link body.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
11657
/*
 * Create a filesystem node (device, fifo, socket, ...) named 'name' in
 * directory 'dir' via CEPH_MDS_OP_MKNOD.  On success *inp receives the
 * new inode.  Fails with -ENAMETOOLONG, -EROFS (snapshot dir), -EDQUOT
 * (file-count quota), or an ACL/dentry/MDS error.  The goto-fail path
 * releases the MetaRequest allocated below.
 */
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Default ACLs from the parent may both adjust 'mode' and produce
  // initial xattrs that ride along in the request payload.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
11711
// Public ll_ mknod: permission check (unless FUSE default_permissions),
// create the node, fill *attr and hand an ll reference to the caller via
// *out on success.
// NOTE(review): attr->st_ino is logged/traced even when r != 0, in which
// case fill_stat() did not run — this reads whatever the caller left in
// *attr.  Confirm callers zero-initialize it.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());  // caller owns an ll reference on *out
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11748
// statx-flavoured ll_ mknod; like ll_mknod but fills a ceph_statx limited
// to the caps derived from (flags, want).
// NOTE(review): stx->stx_ino is logged/traced even when r != 0, in which
// case fill_statx() did not run — confirm callers zero-initialize *stx.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());  // caller owns an ll reference on *out
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
11787
/**
 * Send CEPH_MDS_OP_CREATE to the MDS: create (and optionally open) a regular
 * file named @name under directory @dir.
 *
 * @param dir parent directory inode
 * @param name new entry name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param flags open(2)-style flags; O_CREAT is OR'd in below
 * @param mode permission bits; S_IFREG is OR'd in below
 * @param inp [out] the resulting inode
 * @param fhp if non-NULL, also take an open ref and return a new Fh here
 * @param stripe_unit,stripe_count,object_size optional file layout (0 = default)
 * @param data_pool optional data pool name; resolved to a pool id
 * @param created [out] filled by make_request(); true if the MDS created it
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
                    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
                    int object_size, const char *data_pool, bool *created,
                    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  // resolve an explicit data pool name to its id, if one was given
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // wire format can only carry 32 bits of pool id
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFREG;
  bufferlist xattrs_bl;
  // may adjust mode and emit ACL xattrs inherited from the parent's default ACL
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
    << " layout " << stripe_unit
    << ' ' << stripe_count
    << ' ' << object_size
    <<") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
11880
11881
/**
 * Create a directory named @name under @dir.
 *
 * If @dir is the magic snapdir (snapid == CEPH_SNAPDIR) this issues MKSNAP
 * instead of MKDIR, i.e. mkdir in .snap creates a snapshot of the parent.
 *
 * @param dir parent directory inode
 * @param name new directory (or snapshot) name
 * @param mode permission bits; S_IFDIR is OR'd in below
 * @param perm caller credentials
 * @param inp [out] the resulting inode
 * @return 0 on success, negative errno on failure
 */
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
                   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
    << mode << dec << ", uid " << perm.uid()
    << ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots themselves are read-only; only the snapdir accepts mkdir
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
                                     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  // may adjust mode and emit ACL xattrs inherited from the parent's default ACL
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
11937
11938 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11939 struct stat *attr, Inode **out, const UserPerm& perm)
11940 {
11941 Mutex::Locker lock(client_lock);
11942
11943 if (unmounting)
11944 return -ENOTCONN;
11945
11946 vinodeno_t vparent = _get_vino(parent);
11947
11948 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11949 tout(cct) << "ll_mkdir" << std::endl;
11950 tout(cct) << vparent.ino.val << std::endl;
11951 tout(cct) << name << std::endl;
11952 tout(cct) << mode << std::endl;
11953
11954 if (!cct->_conf->fuse_default_permissions) {
11955 int r = may_create(parent, perm);
11956 if (r < 0)
11957 return r;
11958 }
11959
11960 InodeRef in;
11961 int r = _mkdir(parent, name, mode, perm, &in);
11962 if (r == 0) {
11963 fill_stat(in, attr);
11964 _ll_get(in.get());
11965 }
11966 tout(cct) << attr->st_ino << std::endl;
11967 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11968 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11969 *out = in.get();
11970 return r;
11971 }
11972
11973 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11974 struct ceph_statx *stx, unsigned want, unsigned flags,
11975 const UserPerm& perms)
11976 {
11977 Mutex::Locker lock(client_lock);
11978
11979 if (unmounting)
11980 return -ENOTCONN;
11981
11982 vinodeno_t vparent = _get_vino(parent);
11983
11984 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11985 tout(cct) << "ll_mkdirx" << std::endl;
11986 tout(cct) << vparent.ino.val << std::endl;
11987 tout(cct) << name << std::endl;
11988 tout(cct) << mode << std::endl;
11989
11990 if (!cct->_conf->fuse_default_permissions) {
11991 int r = may_create(parent, perms);
11992 if (r < 0)
11993 return r;
11994 }
11995
11996 InodeRef in;
11997 int r = _mkdir(parent, name, mode, perms, &in);
11998 if (r == 0) {
11999 fill_statx(in, statx_to_mask(flags, want), stx);
12000 _ll_get(in.get());
12001 } else {
12002 stx->stx_ino = 0;
12003 stx->stx_mask = 0;
12004 }
12005 tout(cct) << stx->stx_ino << std::endl;
12006 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12007 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12008 *out = in.get();
12009 return r;
12010 }
12011
/**
 * Create a symlink named @name under @dir pointing at @target.
 *
 * @param dir parent directory inode
 * @param name new link name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param target link target string (sent as string2 in the request)
 * @param perms caller credentials
 * @param inp [out] the resulting inode
 * @return 0 on success, negative errno on failure
 */
int Client::_symlink(Inode *dir, const char *name, const char *target,
                     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
    << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
    << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12057
12058 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12059 struct stat *attr, Inode **out, const UserPerm& perms)
12060 {
12061 Mutex::Locker lock(client_lock);
12062
12063 if (unmounting)
12064 return -ENOTCONN;
12065
12066 vinodeno_t vparent = _get_vino(parent);
12067
12068 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12069 << dendl;
12070 tout(cct) << "ll_symlink" << std::endl;
12071 tout(cct) << vparent.ino.val << std::endl;
12072 tout(cct) << name << std::endl;
12073 tout(cct) << value << std::endl;
12074
12075 if (!cct->_conf->fuse_default_permissions) {
12076 int r = may_create(parent, perms);
12077 if (r < 0)
12078 return r;
12079 }
12080
12081 InodeRef in;
12082 int r = _symlink(parent, name, value, perms, &in);
12083 if (r == 0) {
12084 fill_stat(in, attr);
12085 _ll_get(in.get());
12086 }
12087 tout(cct) << attr->st_ino << std::endl;
12088 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12089 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12090 *out = in.get();
12091 return r;
12092 }
12093
12094 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12095 Inode **out, struct ceph_statx *stx, unsigned want,
12096 unsigned flags, const UserPerm& perms)
12097 {
12098 Mutex::Locker lock(client_lock);
12099
12100 if (unmounting)
12101 return -ENOTCONN;
12102
12103 vinodeno_t vparent = _get_vino(parent);
12104
12105 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12106 << dendl;
12107 tout(cct) << "ll_symlinkx" << std::endl;
12108 tout(cct) << vparent.ino.val << std::endl;
12109 tout(cct) << name << std::endl;
12110 tout(cct) << value << std::endl;
12111
12112 if (!cct->_conf->fuse_default_permissions) {
12113 int r = may_create(parent, perms);
12114 if (r < 0)
12115 return r;
12116 }
12117
12118 InodeRef in;
12119 int r = _symlink(parent, name, value, perms, &in);
12120 if (r == 0) {
12121 fill_statx(in, statx_to_mask(flags, want), stx);
12122 _ll_get(in.get());
12123 }
12124 tout(cct) << stx->stx_ino << std::endl;
12125 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12126 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12127 *out = in.get();
12128 return r;
12129 }
12130
/**
 * Unlink entry @name from directory @dir.
 *
 * Looks up the target inode first so delegations can be broken and the
 * MDS can be told which caps to drop on it.
 *
 * @param dir parent directory inode
 * @param name entry to remove
 * @param perm caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
    << " uid " << perm.uid() << " gid " << perm.gid()
    << ")" << dendl;

  // snapshots are read-only
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // find the inode being unlinked; it rides along as the "other" inode
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  // recall any delegations handed out on the target before removing it
  in->break_all_delegs();
  req->set_other_inode(in);
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12180
12181 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12182 {
12183 Mutex::Locker lock(client_lock);
12184
12185 if (unmounting)
12186 return -ENOTCONN;
12187
12188 vinodeno_t vino = _get_vino(in);
12189
12190 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12191 tout(cct) << "ll_unlink" << std::endl;
12192 tout(cct) << vino.ino.val << std::endl;
12193 tout(cct) << name << std::endl;
12194
12195 if (!cct->_conf->fuse_default_permissions) {
12196 int r = may_delete(in, name, perm);
12197 if (r < 0)
12198 return r;
12199 }
12200 return _unlink(in, name, perm);
12201 }
12202
/**
 * Remove directory @name under @dir.
 *
 * If @dir is the snapdir (snapid == CEPH_SNAPDIR), this issues RMSNAP
 * instead of RMDIR, deleting the snapshot of that name.
 *
 * @param dir parent directory inode
 * @param name directory (or snapshot) to remove
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
    << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // snapshots themselves are read-only; only the snapdir accepts rmdir
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    // RMSNAP: the request does not take the dentry, so pin it ourselves
    // until we've unlinked it from the cache below
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_other_inode(in.get());
  } else {
    // rmsnap reply carries no trace dentry, so invalidate the cached
    // dentry manually and drop the pin taken above
    unlink(de, true, true);
    de->put();
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12256
12257 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12258 {
12259 Mutex::Locker lock(client_lock);
12260
12261 if (unmounting)
12262 return -ENOTCONN;
12263
12264 vinodeno_t vino = _get_vino(in);
12265
12266 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12267 tout(cct) << "ll_rmdir" << std::endl;
12268 tout(cct) << vino.ino.val << std::endl;
12269 tout(cct) << name << std::endl;
12270
12271 if (!cct->_conf->fuse_default_permissions) {
12272 int r = may_delete(in, name, perms);
12273 if (r < 0)
12274 return r;
12275 }
12276
12277 return _rmdir(in, name, perms);
12278 }
12279
/**
 * Rename @fromname in @fromdir to @toname in @todir.
 *
 * Refuses cross-snapshot renames (-EXDEV) and renames that would cross a
 * quota-root boundary (-EXDEV), since quota accounting cannot follow the
 * file. Renaming within a snapdir becomes RENAMESNAP; any other rename
 * involving a snapshot is -EROFS.
 *
 * @return 0 on success, negative errno on failure
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
    << todir->ino << " " << toname
    << " uid " << perm.uid() << " gid " << perm.gid() << ")"
    << dendl;

  // source and destination must live in the same snapshot context
  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;   // renaming a snapshot within .snap
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    // disallow renames across quota roots
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    // look up the inode being moved so its delegations can be broken
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // a pre-existing destination entry (if any) will be clobbered; break
    // its delegations too. -ENOENT just means nothing is in the way.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12384
12385 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12386 const char *newname, const UserPerm& perm)
12387 {
12388 Mutex::Locker lock(client_lock);
12389
12390 if (unmounting)
12391 return -ENOTCONN;
12392
12393 vinodeno_t vparent = _get_vino(parent);
12394 vinodeno_t vnewparent = _get_vino(newparent);
12395
12396 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12397 << vnewparent << " " << newname << dendl;
12398 tout(cct) << "ll_rename" << std::endl;
12399 tout(cct) << vparent.ino.val << std::endl;
12400 tout(cct) << name << std::endl;
12401 tout(cct) << vnewparent.ino.val << std::endl;
12402 tout(cct) << newname << std::endl;
12403
12404 if (!cct->_conf->fuse_default_permissions) {
12405 int r = may_delete(parent, name, perm);
12406 if (r < 0)
12407 return r;
12408 r = may_delete(newparent, newname, perm);
12409 if (r < 0 && r != -ENOENT)
12410 return r;
12411 }
12412
12413 return _rename(parent, name, newparent, newname, perm);
12414 }
12415
/**
 * Create hard link @newname in @dir pointing to existing inode @in.
 *
 * @param in existing inode to link to
 * @param dir directory to create the new name in
 * @param newname new entry name (rejected with -ENAMETOOLONG past NAME_MAX)
 * @param perm caller credentials
 * @param inp [out] the (linked) inode
 * @return 0 on success, negative errno on failure
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
    << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // neither end may be in a snapshot
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  // recall any delegations on the target before changing its link count
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  // request was never submitted; drop the ref we hold on it
  put_request(req);
  return res;
}
12460
12461 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12462 const UserPerm& perm)
12463 {
12464 Mutex::Locker lock(client_lock);
12465
12466 if (unmounting)
12467 return -ENOTCONN;
12468
12469 vinodeno_t vino = _get_vino(in);
12470 vinodeno_t vnewparent = _get_vino(newparent);
12471
12472 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12473 newname << dendl;
12474 tout(cct) << "ll_link" << std::endl;
12475 tout(cct) << vino.ino.val << std::endl;
12476 tout(cct) << vnewparent << std::endl;
12477 tout(cct) << newname << std::endl;
12478
12479 int r = 0;
12480 InodeRef target;
12481
12482 if (!cct->_conf->fuse_default_permissions) {
12483 if (S_ISDIR(in->mode))
12484 return -EPERM;
12485
12486 r = may_hardlink(in, perm);
12487 if (r < 0)
12488 return r;
12489
12490 r = may_create(newparent, perm);
12491 if (r < 0)
12492 return r;
12493 }
12494
12495 return _link(in, newparent, newname, perm, &target);
12496 }
12497
12498 int Client::ll_num_osds(void)
12499 {
12500 Mutex::Locker lock(client_lock);
12501 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12502 }
12503
12504 int Client::ll_osdaddr(int osd, uint32_t *addr)
12505 {
12506 Mutex::Locker lock(client_lock);
12507
12508 entity_addr_t g;
12509 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12510 if (!o.exists(osd))
12511 return false;
12512 g = o.get_addr(osd);
12513 return true;
12514 });
12515 if (!exists)
12516 return -1;
12517 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12518 *addr = ntohl(nb_addr);
12519 return 0;
12520 }
12521
12522 uint32_t Client::ll_stripe_unit(Inode *in)
12523 {
12524 Mutex::Locker lock(client_lock);
12525 return in->layout.stripe_unit;
12526 }
12527
12528 uint64_t Client::ll_snap_seq(Inode *in)
12529 {
12530 Mutex::Locker lock(client_lock);
12531 return in->snaprealm->seq;
12532 }
12533
12534 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12535 {
12536 Mutex::Locker lock(client_lock);
12537 *layout = in->layout;
12538 return 0;
12539 }
12540
12541 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12542 {
12543 return ll_file_layout(fh->inode.get(), layout);
12544 }
12545
12546 /* Currently we cannot take advantage of redundancy in reads, since we
12547 would have to go through all possible placement groups (a
12548 potentially quite large number determined by a hash), and use CRUSH
12549 to calculate the appropriate set of OSDs for each placement group,
12550 then index into that. An array with one entry per OSD is much more
12551 tractable and works for demonstration purposes. */
12552
12553 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12554 file_layout_t* layout)
12555 {
12556 Mutex::Locker lock(client_lock);
12557
12558 inodeno_t ino = in->ino;
12559 uint32_t object_size = layout->object_size;
12560 uint32_t su = layout->stripe_unit;
12561 uint32_t stripe_count = layout->stripe_count;
12562 uint64_t stripes_per_object = object_size / su;
12563
12564 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12565 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12566 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12567 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12568
12569 object_t oid = file_object_t(ino, objectno);
12570 return objecter->with_osdmap([&](const OSDMap& o) {
12571 ceph_object_layout olayout =
12572 o.file_to_object_layout(oid, *layout);
12573 pg_t pg = (pg_t)olayout.ol_pgid;
12574 vector<int> osds;
12575 int primary;
12576 o.pg_to_acting_osds(pg, &osds, &primary);
12577 return primary;
12578 });
12579 }
12580
12581 /* Return the offset of the block, internal to the object */
12582
12583 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12584 {
12585 Mutex::Locker lock(client_lock);
12586 file_layout_t *layout=&(in->layout);
12587 uint32_t object_size = layout->object_size;
12588 uint32_t su = layout->stripe_unit;
12589 uint64_t stripes_per_object = object_size / su;
12590
12591 return (blockno % stripes_per_object) * su;
12592 }
12593
12594 int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12595 const UserPerm& perms)
12596 {
12597 Mutex::Locker lock(client_lock);
12598
12599 if (unmounting)
12600 return -ENOTCONN;
12601
12602 vinodeno_t vino = _get_vino(in);
12603
12604 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12605 tout(cct) << "ll_opendir" << std::endl;
12606 tout(cct) << vino.ino.val << std::endl;
12607
12608 if (!cct->_conf->fuse_default_permissions) {
12609 int r = may_open(in, flags, perms);
12610 if (r < 0)
12611 return r;
12612 }
12613
12614 int r = _opendir(in, dirpp, perms);
12615 tout(cct) << (unsigned long)*dirpp << std::endl;
12616
12617 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12618 << dendl;
12619 return r;
12620 }
12621
12622 int Client::ll_releasedir(dir_result_t *dirp)
12623 {
12624 Mutex::Locker lock(client_lock);
12625 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12626 tout(cct) << "ll_releasedir" << std::endl;
12627 tout(cct) << (unsigned long)dirp << std::endl;
12628
12629 if (unmounting)
12630 return -ENOTCONN;
12631
12632 _closedir(dirp);
12633 return 0;
12634 }
12635
12636 int Client::ll_fsyncdir(dir_result_t *dirp)
12637 {
12638 Mutex::Locker lock(client_lock);
12639 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12640 tout(cct) << "ll_fsyncdir" << std::endl;
12641 tout(cct) << (unsigned long)dirp << std::endl;
12642
12643 if (unmounting)
12644 return -ENOTCONN;
12645
12646 return _fsync(dirp->inode.get(), false);
12647 }
12648
12649 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12650 {
12651 assert(!(flags & O_CREAT));
12652
12653 Mutex::Locker lock(client_lock);
12654
12655 if (unmounting)
12656 return -ENOTCONN;
12657
12658 vinodeno_t vino = _get_vino(in);
12659
12660 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12661 tout(cct) << "ll_open" << std::endl;
12662 tout(cct) << vino.ino.val << std::endl;
12663 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12664
12665 int r;
12666 if (!cct->_conf->fuse_default_permissions) {
12667 r = may_open(in, flags, perms);
12668 if (r < 0)
12669 goto out;
12670 }
12671
12672 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12673
12674 out:
12675 Fh *fhptr = fhp ? *fhp : NULL;
12676 if (fhptr) {
12677 ll_unclosed_fh_set.insert(fhptr);
12678 }
12679 tout(cct) << (unsigned long)fhptr << std::endl;
12680 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12681 " = " << r << " (" << fhptr << ")" << dendl;
12682 return r;
12683 }
12684
/**
 * Shared implementation behind ll_create/ll_createx: look up @name under
 * @parent and, if absent and O_CREAT is set, create it; then open it.
 *
 * @param parent parent directory inode
 * @param name entry name
 * @param mode creation mode (when creating)
 * @param flags open(2)-style flags; O_CREAT/O_EXCL honored
 * @param in [out] the looked-up or created inode
 * @param caps attribute mask to request on lookup
 * @param fhp [out] open file handle; set to NULL first, filled on open
 * @param perms caller credentials
 * @return 0 on success, negative errno on failure
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // O_CREAT|O_EXCL on an existing entry is an error
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also perform the open (fhp non-NULL) in one round trip
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: the MDS did not check open permission for us
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  if (*fhp) {
    // remember the handle so a leaked fh can be reclaimed at unmount
    ll_unclosed_fh_set.insert(*fhp);
  }

  // resolve the ino (possibly a faked one) purely for trace/log output
  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
12766
12767 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12768 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12769 const UserPerm& perms)
12770 {
12771 Mutex::Locker lock(client_lock);
12772 InodeRef in;
12773
12774 if (unmounting)
12775 return -ENOTCONN;
12776
12777 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12778 fhp, perms);
12779 if (r >= 0) {
12780 assert(in);
12781
12782 // passing an Inode in outp requires an additional ref
12783 if (outp) {
12784 _ll_get(in.get());
12785 *outp = in.get();
12786 }
12787 fill_stat(in, attr);
12788 } else {
12789 attr->st_ino = 0;
12790 }
12791
12792 return r;
12793 }
12794
12795 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12796 int oflags, Inode **outp, Fh **fhp,
12797 struct ceph_statx *stx, unsigned want, unsigned lflags,
12798 const UserPerm& perms)
12799 {
12800 unsigned caps = statx_to_mask(lflags, want);
12801 Mutex::Locker lock(client_lock);
12802 InodeRef in;
12803
12804 if (unmounting)
12805 return -ENOTCONN;
12806
12807 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12808 if (r >= 0) {
12809 assert(in);
12810
12811 // passing an Inode in outp requires an additional ref
12812 if (outp) {
12813 _ll_get(in.get());
12814 *outp = in.get();
12815 }
12816 fill_statx(in, caps, stx);
12817 } else {
12818 stx->stx_ino = 0;
12819 stx->stx_mask = 0;
12820 }
12821
12822 return r;
12823 }
12824
12825 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12826 {
12827 Mutex::Locker lock(client_lock);
12828 tout(cct) << "ll_lseek" << std::endl;
12829 tout(cct) << offset << std::endl;
12830 tout(cct) << whence << std::endl;
12831
12832 if (unmounting)
12833 return -ENOTCONN;
12834
12835 return _lseek(fh, offset, whence);
12836 }
12837
12838 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12839 {
12840 Mutex::Locker lock(client_lock);
12841 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
12842 tout(cct) << "ll_read" << std::endl;
12843 tout(cct) << (unsigned long)fh << std::endl;
12844 tout(cct) << off << std::endl;
12845 tout(cct) << len << std::endl;
12846
12847 if (unmounting)
12848 return -ENOTCONN;
12849
12850 return _read(fh, off, len, bl);
12851 }
12852
/**
 * Read up to @length bytes at @offset from a single RADOS object
 * (block @blockid of @in's file), bypassing the page/object cache.
 *
 * @param in file inode (used only for its vino)
 * @param blockid object index within the file
 * @param buf destination buffer, must hold @length bytes
 * @param offset byte offset within the object
 * @param length bytes to read
 * @param layout file layout (supplies the data pool)
 * @return bytes read on success, negative errno on failure
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // drop client_lock while blocked on the OSD reply so other client
  // operations can make progress; retake it before touching client state
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // copy out whatever the OSD returned and report its actual size
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
12889
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

/**
 * Write one RADOS object block directly, bypassing striping and caching.
 * Returns `length` on success, negative errno on failure.
 *
 * NOTE(review): the `sync` parameter is currently ignored — the
 * `if (true || sync)` below forces every write down the stable
 * (wait-for-commit) path; the unstable barrier path is stubbed out.
 */
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
			      barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;  // never submitted, so we own it
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe);

  client_lock.Unlock();
  // done==false only on the stable path; wait for the commit callback.
  if (!done /* also !sync */) {
    flock.Lock();
    while (! done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
12965
/**
 * Commit previously-written unstable blocks in [offset, offset+length).
 *
 * The barrier-based implementation is currently disabled (kept below for
 * reference); this is a no-op that always reports success.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  Mutex::Locker lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -EINVAL;
  }

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
12991
12992 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12993 {
12994 Mutex::Locker lock(client_lock);
12995 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12996 "~" << len << dendl;
12997 tout(cct) << "ll_write" << std::endl;
12998 tout(cct) << (unsigned long)fh << std::endl;
12999 tout(cct) << off << std::endl;
13000 tout(cct) << len << std::endl;
13001
13002 if (unmounting)
13003 return -ENOTCONN;
13004
13005 int r = _write(fh, off, len, data, NULL, 0);
13006 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13007 << dendl;
13008 return r;
13009 }
13010
13011 int Client::ll_flush(Fh *fh)
13012 {
13013 Mutex::Locker lock(client_lock);
13014 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13015 tout(cct) << "ll_flush" << std::endl;
13016 tout(cct) << (unsigned long)fh << std::endl;
13017
13018 if (unmounting)
13019 return -ENOTCONN;
13020
13021 return _flush(fh);
13022 }
13023
13024 int Client::ll_fsync(Fh *fh, bool syncdataonly)
13025 {
13026 Mutex::Locker lock(client_lock);
13027 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13028 tout(cct) << "ll_fsync" << std::endl;
13029 tout(cct) << (unsigned long)fh << std::endl;
13030
13031 if (unmounting)
13032 return -ENOTCONN;
13033
13034 int r = _fsync(fh, syncdataonly);
13035 if (r) {
13036 // If we're returning an error, clear it from the FH
13037 fh->take_async_err();
13038 }
13039 return r;
13040 }
13041
13042 int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13043 {
13044 Mutex::Locker lock(client_lock);
13045 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13046 tout(cct) << "ll_sync_inode" << std::endl;
13047 tout(cct) << (unsigned long)in << std::endl;
13048
13049 if (unmounting)
13050 return -ENOTCONN;
13051
13052 return _fsync(in, syncdataonly);
13053 }
13054
#ifdef FALLOC_FL_PUNCH_HOLE

/**
 * fallocate(2) core: punch a hole or (pre)extend a file.
 *
 * Supported mode bits are FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE;
 * matching Linux semantics, PUNCH_HOLE requires KEEP_SIZE.  Called with
 * client_lock held.  Returns 0 or negative errno.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocating into a full pool would stall forever; punching a hole
  // frees space, so it is still permitted.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Only a plain extending allocate (no KEEP_SIZE/PUNCH_HOLE) grows the
  // file, so only that case is checked against quota.
  uint64_t size = offset + length;
  std::list<InodeRef> quota_roots;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Data is still inline and we hold the buffer cap: rewrite the
      // inline blob locally, zero-filling the punched range.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise push any inline data out to RADOS first, then zero
      // the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero to commit without holding client_lock.
      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain allocate past EOF: just grow the metadata size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, quota_roots)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    // Wait for the uninline to finish (again without client_lock).
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    // -ECANCELED means someone else uninlined first — still a success.
    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

// Platform lacks fallocate(2) mode flags; report unsupported.
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13200
13201
13202 int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
13203 {
13204 Mutex::Locker lock(client_lock);
13205 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
13206 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
13207 tout(cct) << (unsigned long)fh << std::endl;
13208
13209 if (unmounting)
13210 return -ENOTCONN;
13211
13212 return _fallocate(fh, mode, offset, length);
13213 }
13214
13215 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13216 {
13217 Mutex::Locker lock(client_lock);
13218 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
13219
13220 if (unmounting)
13221 return -ENOTCONN;
13222
13223 Fh *fh = get_filehandle(fd);
13224 if (!fh)
13225 return -EBADF;
13226 #if defined(__linux__) && defined(O_PATH)
13227 if (fh->flags & O_PATH)
13228 return -EBADF;
13229 #endif
13230 return _fallocate(fh, mode, offset, length);
13231 }
13232
13233 int Client::ll_release(Fh *fh)
13234 {
13235 Mutex::Locker lock(client_lock);
13236
13237 if (unmounting)
13238 return -ENOTCONN;
13239
13240 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
13241 dendl;
13242 tout(cct) << "ll_release (fh)" << std::endl;
13243 tout(cct) << (unsigned long)fh << std::endl;
13244
13245 if (ll_unclosed_fh_set.count(fh))
13246 ll_unclosed_fh_set.erase(fh);
13247 return _release_fh(fh);
13248 }
13249
13250 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13251 {
13252 Mutex::Locker lock(client_lock);
13253
13254 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13255 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13256
13257 if (unmounting)
13258 return -ENOTCONN;
13259
13260 return _getlk(fh, fl, owner);
13261 }
13262
13263 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13264 {
13265 Mutex::Locker lock(client_lock);
13266
13267 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
13268 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13269
13270 if (unmounting)
13271 return -ENOTCONN;
13272
13273 return _setlk(fh, fl, owner, sleep);
13274 }
13275
13276 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13277 {
13278 Mutex::Locker lock(client_lock);
13279
13280 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13281 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13282
13283 if (unmounting)
13284 return -ENOTCONN;
13285
13286 return _flock(fh, cmd, owner);
13287 }
13288
/**
 * Set how long the client holds delegations before breaking them.
 *
 * @param timeout seconds; must be shorter than the MDS session autoclose
 *                interval, or -EINVAL is returned
 * @return 0 on success, -EINVAL if the timeout is too long
 */
int Client::set_deleg_timeout(uint32_t timeout)
{
  Mutex::Locker lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
13303
13304 int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13305 {
13306 int ret = -EINVAL;
13307
13308 Mutex::Locker lock(client_lock);
13309
13310 if (!mounted)
13311 return -ENOTCONN;
13312
13313 Inode *inode = fh->inode.get();
13314
13315 switch(cmd) {
13316 case CEPH_DELEGATION_NONE:
13317 inode->unset_deleg(fh);
13318 ret = 0;
13319 break;
13320 default:
13321 try {
13322 ret = inode->set_deleg(fh, cmd, cb, priv);
13323 } catch (std::bad_alloc) {
13324 ret = -ENOMEM;
13325 }
13326 break;
13327 }
13328 return ret;
13329 }
13330
// Finisher context that cancels an in-flight file-lock request.  Holds a
// ref on the MetaRequest so it stays valid until the interrupt runs.
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    // Runs on the interrupt finisher thread, so take client_lock here.
    Mutex::Locker l(client->client_lock);
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);  // drop the ref taken in the ctor
  }
};
13346
// Asynchronously interrupt a blocked SETFILELOCK request (d is the
// MetaRequest*); the actual cancellation runs on the finisher thread.
void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
13354
13355 // =========================================
13356 // layout
13357
13358 // expose file layouts
13359
13360 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13361 const UserPerm& perms)
13362 {
13363 Mutex::Locker lock(client_lock);
13364
13365 if (unmounting)
13366 return -ENOTCONN;
13367
13368 filepath path(relpath);
13369 InodeRef in;
13370 int r = path_walk(path, &in, perms);
13371 if (r < 0)
13372 return r;
13373
13374 *lp = in->layout;
13375
13376 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13377 return 0;
13378 }
13379
13380 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13381 {
13382 Mutex::Locker lock(client_lock);
13383
13384 if (unmounting)
13385 return -ENOTCONN;
13386
13387 Fh *f = get_filehandle(fd);
13388 if (!f)
13389 return -EBADF;
13390 Inode *in = f->inode.get();
13391
13392 *lp = in->layout;
13393
13394 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13395 return 0;
13396 }
13397
// Return the id of the filesystem's default data pool (the first data
// pool in the MDS map), or -ENOTCONN while unmounting.
int64_t Client::get_default_pool_id()
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
13408
13409 // expose osdmap
13410
13411 int64_t Client::get_pool_id(const char *pool_name)
13412 {
13413 Mutex::Locker lock(client_lock);
13414
13415 if (unmounting)
13416 return -ENOTCONN;
13417
13418 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13419 pool_name);
13420 }
13421
13422 string Client::get_pool_name(int64_t pool)
13423 {
13424 Mutex::Locker lock(client_lock);
13425
13426 if (unmounting)
13427 return string();
13428
13429 return objecter->with_osdmap([pool](const OSDMap& o) {
13430 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13431 });
13432 }
13433
13434 int Client::get_pool_replication(int64_t pool)
13435 {
13436 Mutex::Locker lock(client_lock);
13437
13438 if (unmounting)
13439 return -ENOTCONN;
13440
13441 return objecter->with_osdmap([pool](const OSDMap& o) {
13442 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13443 });
13444 }
13445
/**
 * Find the acting OSDs for the object containing file offset `off`, and
 * optionally the number of bytes remaining in that stripe unit.
 *
 * @param fd   open file descriptor
 * @param off  file offset to locate
 * @param len  out (optional): bytes from off to the end of the stripe unit
 * @param osds out: acting OSD ids for the containing placement group
 * @return 0 on success, negative errno on failure
 */
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // Map exactly one byte so we get a single extent back.
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
13491
13492 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13493 {
13494 Mutex::Locker lock(client_lock);
13495
13496 if (unmounting)
13497 return -ENOTCONN;
13498
13499 if (id < 0)
13500 return -EINVAL;
13501 return objecter->with_osdmap([&](const OSDMap& o) {
13502 return o.crush->get_full_location_ordered(id, path);
13503 });
13504 }
13505
13506 int Client::get_file_stripe_address(int fd, loff_t offset,
13507 vector<entity_addr_t>& address)
13508 {
13509 Mutex::Locker lock(client_lock);
13510
13511 if (unmounting)
13512 return -ENOTCONN;
13513
13514 Fh *f = get_filehandle(fd);
13515 if (!f)
13516 return -EBADF;
13517 Inode *in = f->inode.get();
13518
13519 // which object?
13520 vector<ObjectExtent> extents;
13521 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13522 in->truncate_size, extents);
13523 assert(extents.size() == 1);
13524
13525 // now we have the object and its 'layout'
13526 return objecter->with_osdmap([&](const OSDMap& o) {
13527 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13528 vector<int> osds;
13529 o.pg_to_acting_osds(pg, osds);
13530 if (osds.empty())
13531 return -EINVAL;
13532 for (unsigned i = 0; i < osds.size(); i++) {
13533 entity_addr_t addr = o.get_addr(osds[i]);
13534 address.push_back(addr);
13535 }
13536 return 0;
13537 });
13538 }
13539
13540 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13541 {
13542 Mutex::Locker lock(client_lock);
13543
13544 if (unmounting)
13545 return -ENOTCONN;
13546
13547 return objecter->with_osdmap([&](const OSDMap& o) {
13548 if (!o.exists(osd))
13549 return -ENOENT;
13550
13551 addr = o.get_addr(osd);
13552 return 0;
13553 });
13554 }
13555
13556 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13557 loff_t length, loff_t offset)
13558 {
13559 Mutex::Locker lock(client_lock);
13560
13561 if (unmounting)
13562 return -ENOTCONN;
13563
13564 Fh *f = get_filehandle(fd);
13565 if (!f)
13566 return -EBADF;
13567 Inode *in = f->inode.get();
13568
13569 // map to a list of extents
13570 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13571
13572 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13573 return 0;
13574 }
13575
13576
13577 /* find an osd with the same ip. -ENXIO if none. */
13578 int Client::get_local_osd()
13579 {
13580 Mutex::Locker lock(client_lock);
13581
13582 if (unmounting)
13583 return -ENOTCONN;
13584
13585 objecter->with_osdmap([this](const OSDMap& o) {
13586 if (o.get_epoch() != local_osd_epoch) {
13587 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13588 local_osd_epoch = o.get_epoch();
13589 }
13590 });
13591 return local_osd;
13592 }
13593
13594
13595
13596
13597
13598
13599 // ===============================
13600
// Messenger callback: a connection was (re)established.  Nothing to do
// beyond logging; session state is driven by MDS messages.
void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
13605
// Messenger callback: connection was reset by the peer.  Returning
// false leaves reconnection policy to the messenger.
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
13611
/**
 * Messenger callback: the peer dropped and re-established its session
 * state.  For MDS peers, adjust our MetaSession state machine to match.
 */
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
	   p != mds_sessions.end();
	   ++p) {
	if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
	  mds = p->first;
	  s = p->second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were tearing the session down anyway; treat as closed.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Restart the open, carrying the waiters over to a fresh
	    // session object.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const md_config_t *conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      // Close and let normal open/reconnect machinery rebuild it.
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      // Keep the session around but mark it stale.
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}
13673
// Messenger callback: connection attempt refused by the peer.  False
// means we do not force a session teardown here.
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
13679
13680 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13681 {
13682 if (dest_type == CEPH_ENTITY_TYPE_MON)
13683 return true;
13684 *authorizer = monclient->build_authorizer(dest_type);
13685 return true;
13686 }
13687
/**
 * Walk up from `in` to the nearest ancestor with quota enabled (or the
 * root ancestor if none).  Uses cached dentry leases / dir caps where
 * valid, and falls back to an MDS LOOKUPNAME request otherwise; the walk
 * restarts from `in` after each remote lookup since the lock may have
 * been dropped.
 */
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    // Found a quota root above the starting inode — done.
    if (cur != in && cur->quota.is_enable())
      break;

    // Try to find the parent from local state first.
    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
	Dentry *dn = *p;
	if (dn->lease_mds >= 0 &&
	    dn->lease_ttl > now &&
	    mds_sessions.count(dn->lease_mds)) {
	  // Valid dentry lease from a live session.
	  parent_in = dn->dir->parent_inode;
	} else {
	  // No lease: trust the link only if the parent dir's shared
	  // cap is still current for this dentry.
	  Inode *diri = dn->dir->parent_inode;
	  if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
	      diri->shared_gen == dn->cap_shared_gen) {
	    parent_in = dn->dir->parent_inode;
	  }
	}
	if (parent_in)
	  break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    // Ask the MDS who the parent is.
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
		    << " failed to find parent of " << cur->vino()
		    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    // make_request may have dropped the lock; refresh `now` and restart
    // the walk from the beginning unless we were still at the start.
    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
13759
13760 /**
13761 * Traverse quota ancestors of the Inode, return true
13762 * if any of them passes the passed function
13763 */
13764 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13765 std::function<bool (const Inode &in)> test)
13766 {
13767 while (true) {
13768 assert(in != NULL);
13769 if (test(*in)) {
13770 return true;
13771 }
13772
13773 if (in == root_ancestor) {
13774 // We're done traversing, drop out
13775 return false;
13776 } else {
13777 // Continue up the tree
13778 in = get_quota_root(in, perms);
13779 }
13780 }
13781
13782 return false;
13783 }
13784
13785 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13786 {
13787 return check_quota_condition(in, perms,
13788 [](const Inode &in) {
13789 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13790 });
13791 }
13792
/**
 * Would writing `new_bytes` more exceed any ancestor's byte quota?
 *
 * As a side effect, every quota root visited is appended to
 * *quota_roots (when non-null) so callers can later re-check proximity
 * to the limit without re-walking the tree.
 */
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms,
				     std::list<InodeRef>* quota_roots)
{
  return check_quota_condition(in, perms,
      [&new_bytes, quota_roots](const Inode &in) {
	// collect the roots even when the quota is not exceeded
	if (quota_roots)
	  quota_roots->emplace_back(const_cast<Inode*>(&in));
	return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
	       > in.quota.max_bytes;
      });
}
13805
13806 bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
13807 {
13808 assert(in->size >= in->reported_size);
13809 const uint64_t size = in->size - in->reported_size;
13810
13811 for (auto& diri : quota_roots) {
13812 if (diri->quota.max_bytes) {
13813 if (diri->rstat.rbytes >= diri->quota.max_bytes)
13814 return true;
13815
13816 uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
13817 if ((space >> 4) < size)
13818 return true;
13819 }
13820 }
13821 return false;
13822 }
13823
// State/permission bits cached per (pool, namespace) in pool_perms;
// see check_pool_perm().
enum {
  POOL_CHECKED = 1,   // a probe has completed for this pool
  POOL_CHECKING = 2,  // a probe is in flight; waiters block on this
  POOL_READ = 4,      // client caps allow reading the pool
  POOL_WRITE = 8,     // client caps allow writing the pool
};
13830
/**
 * Verify that our OSD caps permit the access `need` (FILE_RD/FILE_WR)
 * on the inode's data pool, probing the pool with a stat and a create
 * the first time and caching the result in pool_perms.
 *
 * Called with client_lock held; drops it while waiting on the probes.
 * Returns 0 if allowed, -EPERM if denied, -EIO if the probe itself
 * failed for an unrelated reason.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-probe, wait and re-check.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Claim the CHECKING slot so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Probe read permission with a stat on the file's first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Probe write permission with an exclusive create of the same object.
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop client_lock while both probes are in flight.
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    // -ENOENT still proves read access; only -EPERM means "denied".
    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    // -EEXIST still proves write access.
    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
13933
13934 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13935 {
13936 if (acl_type == POSIX_ACL) {
13937 if (in->xattrs.count(ACL_EA_ACCESS)) {
13938 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13939
13940 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13941 }
13942 }
13943 return -EAGAIN;
13944 }
13945
/**
 * Propagate a chmod into the inode's POSIX access ACL: rewrite the ACL
 * group/owner entries to match `mode` and store it back as an xattr.
 * No-op when ACLs are disabled or the inode has no access ACL.
 */
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Make sure the cached xattrs are current before reading the ACL.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      // Work on a private copy; posix_acl_access_chmod edits in place.
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
13971
/**
 * Compute the ACL xattrs a new inode in `dir` should be created with.
 *
 * Applies the directory's default ACL to *mode (POSIX inheritance) and,
 * when the default ACL is not fully representable as mode bits, encodes
 * an access ACL (plus, for directories, the inherited default ACL) into
 * xattrs_bl.  With no default ACL, the umask callback is applied to
 * *mode instead.
 *
 * @return number of xattrs encoded (0 if none), or negative errno
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      // Fold the default ACL into *mode; r > 0 means the ACL carries
      // information beyond plain mode bits.
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// r > 0 again here means the ACL is not equivalent to the mode
	// and must be stored as an access ACL xattr.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	::encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14019
// Enable a global objecter op flag; only "no flags" or localized reads
// are permitted.
void Client::set_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == 0 ||
	 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
14027
14028 void Client::clear_filer_flags(int flags)
14029 {
14030 Mutex::Locker l(client_lock);
14031 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
14032 objecter->clear_global_op_flag(flags);
14033 }
14034
14035 /**
14036 * This is included in cap release messages, to cause
14037 * the MDS to wait until this OSD map epoch. It is necessary
14038 * in corner cases where we cancel RADOS ops, so that
14039 * nobody else tries to do IO to the same objects in
14040 * the same epoch as the cancelled ops.
14041 */
// Record the OSD map epoch to advertise in cap release messages
// (see the comment above for why the MDS must wait for this epoch).
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14047
14048 const char** Client::get_tracked_conf_keys() const
14049 {
14050 static const char* keys[] = {
14051 "client_cache_size",
14052 "client_cache_mid",
14053 "client_acl_type",
14054 "client_deleg_timeout",
14055 "client_deleg_break_on_open",
14056 NULL
14057 };
14058 return keys;
14059 }
14060
14061 void Client::handle_conf_change(const struct md_config_t *conf,
14062 const std::set <std::string> &changed)
14063 {
14064 Mutex::Locker lock(client_lock);
14065
14066 if (changed.count("client_cache_mid")) {
14067 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14068 }
14069 if (changed.count("client_acl_type")) {
14070 acl_type = NO_ACL;
14071 if (cct->_conf->client_acl_type == "posix_acl")
14072 acl_type = POSIX_ACL;
14073 }
14074 }
14075
// boost::intrusive_ptr hook: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14080
// boost::intrusive_ptr hook: drop a reference via the owning client, so the
// inode can be cleaned up/freed when the last reference goes away.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14085
14086 mds_rank_t Client::_get_random_up_mds() const
14087 {
14088 assert(client_lock.is_locked_by_me());
14089
14090 std::set<mds_rank_t> up;
14091 mdsmap->get_up_mds_set(up);
14092
14093 if (up.empty())
14094 return MDS_RANK_NONE;
14095 std::set<mds_rank_t>::const_iterator p = up.begin();
14096 for (int n = rand() % up.size(); n; n--)
14097 ++p;
14098 return *p;
14099 }
14100
14101
// Standalone (non-embedded) client: owns its own Objecter, constructed here
// and handed to the Client base; destroyed in ~StandaloneClient().
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14108
StandaloneClient::~StandaloneClient()
{
  // We own the objecter (created in our constructor); null it out so the
  // base class destructor cannot touch a dangling pointer.
  delete objecter;
  objecter = nullptr;
}
14114
// Bring up the standalone client: timer, object cacher, objecter, dispatch
// wiring, and the mon session. Returns 0 on success or a negative errno
// from MonClient::init(), after rolling back the partial initialization.
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  // objecter before ourselves so it gets first crack at incoming messages
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    // (timer shutdown requires client_lock held; the rest must run unlocked)
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  // finish generic Client initialization (takes the lock itself)
  _finish_init();

  return 0;
}
14145
void StandaloneClient::shutdown()
{
  // Tear down in reverse of init(): generic client state first, then the
  // objecter we own, then the mon session.
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}