ceph/src/mds/RecoveryQueue.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #include "CInode.h"
  16 #include "MDCache.h"
  17 #include "MDSRank.h"
  18 #include "Locker.h"
  19 #include "osdc/Filer.h"
  20
  21 #include "RecoveryQueue.h"
  22
  23 #define dout_context g_ceph_context
  24 #define dout_subsys ceph_subsys_mds
  25 #undef dout_prefix
  26 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
  27
  28 class C_MDC_Recover : public MDSIOContextBase {
  29 protected:
  30   RecoveryQueue *rq;
  31   CInode *in;
  32   void finish(int r) override {
  33     rq->_recovered(in, r, size, mtime);
  34   }
  35
  36   MDSRank *get_mds() override {
  37     return rq->mds;
  38   }
  39
  40 public:
  41   uint64_t size;
  42   utime_t mtime;
  43
  44   C_MDC_Recover(RecoveryQueue *rq_, CInode *i) : rq(rq_), in(i), size(0) {
  45     assert(rq != NULL);
  46   }
  47 };
  48
  49
  50 RecoveryQueue::RecoveryQueue(MDSRank *mds_)
  51   : mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher)
  52 {}
  53
  54
  55 /**
  56  * Progress the queue.  Call this after enqueuing something or on
  57  * completion of something.
  58  */
  59 void RecoveryQueue::advance()
  60 {
  61   dout(10) << file_recover_queue.size() << " queued, "
  62            << file_recover_queue_front.size() << " prioritized, "
  63            << file_recovering.size() << " recovering" << dendl;
  64
  65   while (file_recovering.size() < g_conf->mds_max_file_recover) {
  66     if (!file_recover_queue_front.empty()) {
  67       CInode *in = *file_recover_queue_front.begin();
  68       file_recover_queue_front.erase(file_recover_queue_front.begin());
  69       file_recover_queue.erase(in);
  70       _start(in);
  71     } else if (!file_recover_queue.empty()) {
  72       CInode *in = *file_recover_queue.begin();
  73       file_recover_queue.erase(file_recover_queue.begin());
  74       _start(in);
  75     } else {
  76       break;
  77     }
  78   }
  79
  80   logger->set(l_mdc_num_recovering_processing, file_recovering.size());
  81   logger->set(l_mdc_num_recovering_enqueued, file_recover_queue.size());
  82   logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front.size());
  83 }
  84
  85 void RecoveryQueue::_start(CInode *in)
  86 {
  87   inode_t *pi = in->get_projected_inode();
  88
  89   // blech
  90   if (pi->client_ranges.size() && !pi->get_max_size()) {
  91     mds->clog->warn() << "bad client_range " << pi->client_ranges
  92                       << " on ino " << pi->ino;
  93   }
  94
  95   if (pi->client_ranges.size() && pi->get_max_size()) {
  96     dout(10) << "starting " << in->inode.size << " " << pi->client_ranges
  97              << " " << *in << dendl;
  98     file_recovering.insert(in);
  99
 100     C_MDC_Recover *fin = new C_MDC_Recover(this, in);
 101     filer.probe(in->inode.ino, &in->inode.layout, in->last,
 102                 pi->get_max_size(), &fin->size, &fin->mtime, false,
 103                 0, fin);
 104   } else {
 105     dout(10) << "skipping " << in->inode.size << " " << *in << dendl;
 106     in->state_clear(CInode::STATE_RECOVERING);
 107     mds->locker->eval(in, CEPH_LOCK_IFILE);
 108     in->auth_unpin(this);
 109   }
 110 }
 111
 112 void RecoveryQueue::prioritize(CInode *in)
 113 {
 114   if (file_recovering.count(in)) {
 115     dout(10) << "already working on " << *in << dendl;
 116     return;
 117   }
 118
 119   if (file_recover_queue.count(in)) {
 120     dout(20) << *in << dendl;
 121     file_recover_queue_front.insert(in);
 122     logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front.size());
 123     return;
 124   }
 125
 126   dout(10) << "not queued " << *in << dendl;
 127 }
 128
 129
 130 /**
 131  * Given an authoritative inode which is in the cache,
 132  * enqueue it for recovery.
 133  */
 134 void RecoveryQueue::enqueue(CInode *in)
 135 {
 136   dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
 137   assert(logger);  // Caller should have done set_logger before using me
 138   assert(in->is_auth());
 139
 140   in->state_clear(CInode::STATE_NEEDSRECOVER);
 141   if (!in->state_test(CInode::STATE_RECOVERING)) {
 142     in->state_set(CInode::STATE_RECOVERING);
 143     in->auth_pin(this);
 144     logger->inc(l_mdc_recovery_started);
 145   }
 146   file_recover_queue.insert(in);
 147   logger->set(l_mdc_num_recovering_enqueued, file_recover_queue.size());
 148 }
 149
 150
 151 /**
 152  * Call back on completion of Filer probe on an inode.
 153  */
 154 void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
 155 {
 156   dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
 157            << " for " << *in << dendl;
 158
 159   if (r != 0) {
 160     dout(0) << "recovery error! " << r << dendl;
 161     if (r == -EBLACKLISTED) {
 162       mds->respawn();
 163       return;
 164     } else {
 165       // Something wrong on the OSD side trying to recover the size
 166       // of this inode.  In principle we could record this as a piece
 167       // of per-inode damage, but it's actually more likely that
 168       // this indicates something wrong with the MDS (like maybe
 169       // it has the wrong auth caps?)
 170       mds->clog->error() << " OSD read error while recovering size for inode 0x"
 171                          << std::hex << in->ino() << std::dec;
 172       mds->damaged();
 173     }
 174   }
 175
 176   file_recovering.erase(in);
 177   logger->set(l_mdc_num_recovering_processing, file_recovering.size());
 178   logger->inc(l_mdc_recovery_completed);
 179   in->state_clear(CInode::STATE_RECOVERING);
 180
 181   if (!in->get_parent_dn() && !in->get_projected_parent_dn()) {
 182     dout(10) << " inode has no parents, killing it off" << dendl;
 183     in->auth_unpin(this);
 184     mds->mdcache->remove_inode(in);
 185   } else {
 186     // journal
 187     mds->locker->check_inode_max_size(in, true, 0,  size, mtime);
 188     mds->locker->eval(in, CEPH_LOCK_IFILE);
 189     in->auth_unpin(this);
 190   }
 191
 192   advance();
 193 }
 194