import 14.2.4 nautilus point release

[ceph.git] / ceph / src / osd / PG.cc
diff --git a/ceph/src/osd/PG.cc b/ceph/src/osd/PG.cc

index e54827611cadeff277d69de62feaf85e8dbf77c8..8161087bfb4700e1877f018bea978e06b9c6e7ae 100644 (file)
--- a/ceph/src/osd/PG.cc
+++ b/ceph/src/osd/PG.cc
@@ -1123,6 +1123,7 @@ PG::Scrubber::Scrubber()
     active(false),
     shallow_errors(0), deep_errors(0), fixed(0),
     must_scrub(false), must_deep_scrub(false), must_repair(false),
+   need_auto(false), time_for_deep(false),
     auto_repair(false),
     check_repair(false),
     deep_scrub_on_error(false),
@@ -2359,6 +2360,12 @@ bool PG::queue_scrub()
    }
    // An interrupted recovery repair could leave this set.
    state_clear(PG_STATE_REPAIR);
+  if (scrubber.need_auto) {
+    scrubber.must_scrub = true;
+    scrubber.must_deep_scrub = true;
+    scrubber.auto_repair = true;
+    scrubber.need_auto = false;
+  }
    scrubber.priority = scrubber.must_scrub ?
           cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
    scrubber.must_scrub = false;
@@ -4351,73 +4358,99 @@ void PG::requeue_map_waiters()
  // returns true if a scrub has been newly kicked off
  bool PG::sched_scrub()
  {
-  bool nodeep_scrub = false;
    ceph_assert(is_locked());
-  if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
+  ceph_assert(!is_scrubbing());
+  if (!(is_primary() && is_active() && is_clean())) {
      return false;
    }
  
-  double deep_scrub_interval = 0;
-  pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
-  if (deep_scrub_interval <= 0) {
-    deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
-  }
-  bool time_for_deep = ceph_clock_now() >=
-    info.history.last_deep_scrub_stamp + deep_scrub_interval;
-
-  bool deep_coin_flip = false;
-  // Only add random deep scrubs when NOT user initiated scrub
-  if (!scrubber.must_scrub)
-      deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
-  dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
+  // All processing the first time through commits us to whatever
+  // choices are made.
+  if (!scrubber.reserved) {
+    dout(20) << __func__ << ": Start processing pg " << info.pgid << dendl;
+
+    bool allow_deep_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
+                      pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB));
+    bool allow_scrub = !(get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
+                 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB));
+    bool has_deep_errors = (info.stats.stats.sum.num_deep_scrub_errors > 0);
+    bool try_to_auto_repair = (cct->_conf->osd_scrub_auto_repair
+                               && get_pgbackend()->auto_repair_supported());
+
+    scrubber.time_for_deep = false;
+    // Clear these in case user issues the scrub/repair command during
+    // the scheduling of the scrub/repair (e.g. request reservation)
+    scrubber.deep_scrub_on_error = false;
+    scrubber.auto_repair = false;
+
+    // All periodic scrub handling goes here because must_scrub is
+    // always set for must_deep_scrub and must_repair.
+    if (!scrubber.must_scrub) {
+      ceph_assert(!scrubber.must_deep_scrub && !scrubber.must_repair);
+      // Handle deep scrub determination only if allowed
+      if (allow_deep_scrub) {
+        // Initial entry and scheduled scrubs without nodeep_scrub set get here
+        if (scrubber.need_auto) {
+         dout(20) << __func__ << ": need repair after scrub errors" << dendl;
+          scrubber.time_for_deep = true;
+        } else {
+          double deep_scrub_interval = 0;
+          pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
+          if (deep_scrub_interval <= 0) {
+           deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
+          }
+          scrubber.time_for_deep = ceph_clock_now() >=
+                 info.history.last_deep_scrub_stamp + deep_scrub_interval;
  
-  time_for_deep = (time_for_deep || deep_coin_flip);
+          bool deep_coin_flip = false;
+         // If we randomize when !allow_scrub && allow_deep_scrub, then it guarantees
+         // we will deep scrub because this function is called often.
+         if (!scrubber.time_for_deep && allow_scrub)
+           deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
+          dout(20) << __func__ << ": time_for_deep=" << scrubber.time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
  
-  //NODEEP_SCRUB so ignore time initiated deep-scrub
-  if (get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
-      pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
-    time_for_deep = false;
-    nodeep_scrub = true;
-  }
+          scrubber.time_for_deep = (scrubber.time_for_deep || deep_coin_flip);
+        }
  
-  if (!scrubber.must_scrub) {
-    ceph_assert(!scrubber.must_deep_scrub);
+        if (!scrubber.time_for_deep && has_deep_errors) {
+         osd->clog->info() << "osd." << osd->whoami
+                           << " pg " << info.pgid
+                           << " Deep scrub errors, upgrading scrub to deep-scrub";
+         scrubber.time_for_deep = true;
+        }
  
-    //NOSCRUB so skip regular scrubs
-    if ((get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
-        pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
-      if (scrubber.reserved) {
-        // cancel scrub if it is still in scheduling,
-        // so pgs from other pools where scrub are still legal
-        // have a chance to go ahead with scrubbing.
-        clear_scrub_reserved();
-        scrub_unreserve_replicas();
+        if (try_to_auto_repair) {
+          if (scrubber.time_for_deep) {
+            dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
+            scrubber.auto_repair = true;
+          } else if (allow_scrub) {
+            dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
+            scrubber.deep_scrub_on_error = true;
+          }
+        }
+      } else { // !allow_deep_scrub
+        dout(20) << __func__ << ": nodeep_scrub set" << dendl;
+        if (has_deep_errors) {
+          osd->clog->error() << "osd." << osd->whoami
+                            << " pg " << info.pgid
+                            << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
+          return false;
+        }
        }
-      return false;
-    }
-  }
  
-  // Clear these in case user issues the scrub/repair command during
-  // the scheduling of the scrub/repair (e.g. request reservation)
-  scrubber.deep_scrub_on_error = false;
-  scrubber.auto_repair = false;
-  if (cct->_conf->osd_scrub_auto_repair
-      && get_pgbackend()->auto_repair_supported()
-      // respect the command from user, and not do auto-repair
-      && !scrubber.must_repair
-      && !scrubber.must_scrub
-      && !scrubber.must_deep_scrub) {
-    if (time_for_deep) {
-      dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
-      scrubber.auto_repair = true;
-    } else {
-      dout(20) << __func__ << ": auto repair with scrubbing, rescrub if errors found" << dendl;
-      scrubber.deep_scrub_on_error = true;
+      //NOSCRUB so skip regular scrubs
+      if (!allow_scrub && !scrubber.time_for_deep) {
+        return false;
+      }
+    // scrubber.must_scrub
+    } else if (!scrubber.must_deep_scrub && has_deep_errors) {
+       osd->clog->error() << "osd." << osd->whoami
+                          << " pg " << info.pgid
+                          << " Regular scrub request, deep-scrub details will be lost";
      }
-  }
+    // Unless precluded this was handled above
+    scrubber.need_auto = false;
  
-  bool ret = true;
-  if (!scrubber.reserved) {
      ceph_assert(scrubber.reserved_peers.empty());
      if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
           osd->inc_scrubs_pending()) {
@@ -4427,47 +4460,36 @@ bool PG::sched_scrub()
        scrub_reserve_replicas();
      } else {
        dout(20) << __func__ << ": failed to reserve locally" << dendl;
-      ret = false;
+      return false;
      }
    }
+
    if (scrubber.reserved) {
      if (scrubber.reserve_failed) {
-      dout(20) << "sched_scrub: failed, a peer declined" << dendl;
+      dout(20) << __func__ << ": failed, a peer declined" << dendl;
        clear_scrub_reserved();
        scrub_unreserve_replicas();
-      ret = false;
-    } else if (scrubber.reserved_peers.size() == acting.size()) {
-      dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
-      if (time_for_deep) {
-       dout(10) << "sched_scrub: scrub will be deep" << dendl;
+      return false;
+    } else if (scrubber.reserved_peers.size() == actingset.size()) {
+      dout(20) << __func__ << ": success, reserved self and replicas" << dendl;
+      if (scrubber.time_for_deep) {
+       dout(10) << __func__ << ": scrub will be deep" << dendl;
         state_set(PG_STATE_DEEP_SCRUB);
-      } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
-       if (!nodeep_scrub) {
-         osd->clog->info() << "osd." << osd->whoami
-                           << " pg " << info.pgid
-                           << " Deep scrub errors, upgrading scrub to deep-scrub";
-         state_set(PG_STATE_DEEP_SCRUB);
-       } else if (!scrubber.must_scrub) {
-         osd->clog->error() << "osd." << osd->whoami
-                            << " pg " << info.pgid
-                            << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
-         clear_scrub_reserved();
-         scrub_unreserve_replicas();
-         return false;
-       } else {
-         osd->clog->error() << "osd." << osd->whoami
-                            << " pg " << info.pgid
-                            << " Regular scrub request, deep-scrub details will be lost";
-       }
+       scrubber.time_for_deep = false;
        }
        queue_scrub();
      } else {
        // none declined, since scrubber.reserved is set
-      dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
+      dout(20) << __func__ << ": reserved " << scrubber.reserved_peers
+              << ", waiting for replicas" << dendl;
      }
    }
+  return true;
+}
  
-  return ret;
+bool PG::is_scrub_registered()
+{
+  return !scrubber.scrub_reg_stamp.is_zero();
  }
  
  void PG::reg_next_scrub()
@@ -4477,9 +4499,9 @@ void PG::reg_next_scrub()
  
    utime_t reg_stamp;
    bool must = false;
-  if (scrubber.must_scrub) {
+  if (scrubber.must_scrub || scrubber.need_auto) {
      // Set the smallest time that isn't utime_t()
-    reg_stamp = utime_t(0,1);
+    reg_stamp = Scrubber::scrub_must_stamp();
      must = true;
    } else if (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats) {
      reg_stamp = ceph_clock_now();
@@ -4492,7 +4514,7 @@ void PG::reg_next_scrub()
    double scrub_min_interval = 0, scrub_max_interval = 0;
    pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
    pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
-  ceph_assert(scrubber.scrub_reg_stamp == utime_t());
+  ceph_assert(!is_scrub_registered());
    scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
                                                reg_stamp,
                                                scrub_min_interval,
@@ -4504,12 +4526,33 @@ void PG::reg_next_scrub()
  
  void PG::unreg_next_scrub()
  {
-  if (is_primary()) {
+  if (is_scrub_registered()) {
      osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
      scrubber.scrub_reg_stamp = utime_t();
    }
  }
  
+void PG::on_info_history_change()
+{
+  unreg_next_scrub();
+  reg_next_scrub();
+}
+
+void PG::scrub_requested(bool deep, bool repair, bool need_auto)
+{
+  unreg_next_scrub();
+  if (need_auto) {
+    scrubber.need_auto = true;
+  } else {
+    scrubber.must_scrub = true;
+    scrubber.must_deep_scrub = deep || repair;
+    scrubber.must_repair = repair;
+    // User might intervene, so clear this
+    scrubber.need_auto = false;
+  }
+  reg_next_scrub();
+}
+
  void PG::do_replica_scrub_map(OpRequestRef op)
  {
    const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
@@ -4700,8 +4743,8 @@ void PG::clear_scrub_reserved()
  void PG::scrub_reserve_replicas()
  {
    ceph_assert(backfill_targets.empty());
-  for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
-       i != acting_recovery_backfill.end();
+  for (set<pg_shard_t>::iterator i = actingset.begin();
+       i != actingset.end();
         ++i) {
      if (*i == pg_whoami) continue;
      dout(10) << "scrub requesting reserve from osd." << *i << dendl;
@@ -4717,8 +4760,8 @@ void PG::scrub_reserve_replicas()
  void PG::scrub_unreserve_replicas()
  {
    ceph_assert(backfill_targets.empty());
-  for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
-       i != acting_recovery_backfill.end();
+  for (set<pg_shard_t>::iterator i = actingset.begin();
+       i != actingset.end();
         ++i) {
      if (*i == pg_whoami) continue;
      dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
@@ -5818,7 +5861,7 @@ void PG::scrub_finish()
  {
    dout(20) << __func__ << dendl;
    bool repair = state_test(PG_STATE_REPAIR);
-  bool do_deep_scrub = false;
+  bool do_auto_scrub = false;
    // if the repair request comes from auto-repair and large number of errors,
    // we would like to cancel auto-repair
    if (repair && scrubber.auto_repair
@@ -5831,12 +5874,13 @@ void PG::scrub_finish()
  
    // if a regular scrub had errors within the limit, do a deep scrub to auto repair.
    if (scrubber.deep_scrub_on_error
+      && scrubber.authoritative.size()
        && scrubber.authoritative.size() <= cct->_conf->osd_scrub_auto_repair_num_errors) {
      ceph_assert(!deep_scrub);
-    scrubber.deep_scrub_on_error = false;
-    do_deep_scrub = true;
+    do_auto_scrub = true;
      dout(20) << __func__ << " Try to auto repair after scrub errors" << dendl;
    }
+  scrubber.deep_scrub_on_error = false;
  
    // type-specific finish (can tally more errors)
    _scrub_finish();
@@ -5920,14 +5964,6 @@ void PG::scrub_finish()
      }
    }
    publish_stats_to_osd();
-  if (do_deep_scrub) {
-    // XXX: Auto scrub won't activate if must_scrub is set, but
-    // setting the scrub stamps affects what users see.
-    utime_t stamp = utime_t(0,1);
-    set_last_scrub_stamp(stamp);
-    set_last_deep_scrub_stamp(stamp);
-  }
-  reg_next_scrub();
  
    {
      ObjectStore::Transaction t;
@@ -5950,6 +5986,12 @@ void PG::scrub_finish()
    scrub_clear_state(has_error);
    scrub_unreserve_replicas();
  
+  if (do_auto_scrub) {
+    scrub_requested(false, false, true);
+  } else {
+    reg_next_scrub();
+  }
+
    if (is_active() && is_primary()) {
      share_pg_info();
    }
@@ -6077,7 +6119,6 @@ void PG::merge_new_log_entries(
  
  void PG::update_history(const pg_history_t& new_history)
  {
-  unreg_next_scrub();
    if (info.history.merge(new_history)) {
      dout(20) << __func__ << " advanced history from " << new_history << dendl;
      dirty_info = true;
@@ -6087,7 +6128,7 @@ void PG::update_history(const pg_history_t& new_history)
        dirty_big_info = true;
      }
    }
-  reg_next_scrub();
+  on_info_history_change();
  }
  
  void PG::fulfill_info(
@@ -6291,8 +6332,6 @@ void PG::start_peering_interval(
    vector<int> oldacting, oldup;
    int oldrole = get_role();
  
-  unreg_next_scrub();
-
    if (is_primary()) {
      osd->clear_ready_to_merge(this);
    }
@@ -6484,7 +6523,7 @@ void PG::on_new_interval()
  {
    const OSDMapRef osdmap = get_osdmap();
  
-  reg_next_scrub();
+  on_info_history_change();
  
    // initialize features
    acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
@@ -6597,6 +6636,10 @@ ostream& operator<<(ostream& out, const PG& pg)
      out << " MUST_DEEP_SCRUB";
    if (pg.scrubber.must_scrub)
      out << " MUST_SCRUB";
+  if (pg.scrubber.time_for_deep)
+    out << " TIME_FOR_DEEP";
+  if (pg.scrubber.need_auto)
+    out << " NEED_AUTO";
  
    //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
    if (pg.pg_log.get_missing().num_missing()) {
@@ -7014,9 +7057,13 @@ void PG::_delete_some(ObjectStore::Transaction *t)
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    int64_t num = 0;
    for (auto& oid : olist) {
-    if (oid.is_pgmeta()) {
+    if (oid == pgmeta_oid) {
        continue;
      }
+    if (oid.is_pgmeta()) {
+      osd->clog->warn() << info.pgid << " found stray pgmeta-like " << oid
+                       << " during PG removal";
+    }
      int r = snap_mapper.remove_oid(oid.hobj, &_t);
      if (r != 0 && r != -ENOENT) {
        ceph_abort();
@@ -7394,11 +7441,7 @@ boost::statechart::result PG::RecoveryState::Primary::react(
  {
    PG *pg = context< RecoveryMachine >().pg;
    if (pg->is_primary()) {
-    pg->unreg_next_scrub();
-    pg->scrubber.must_scrub = true;
-    pg->scrubber.must_deep_scrub = evt.deep || evt.repair;
-    pg->scrubber.must_repair = evt.repair;
-    pg->reg_next_scrub();
+    pg->scrub_requested(evt.deep, evt.repair);
      ldout(pg->cct,10) << "marking for scrub" << dendl;
    }
    return discard_event();
@@ -8969,9 +9012,8 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
    if (msg->info.last_backfill == hobject_t()) {
      // restart backfill
-    pg->unreg_next_scrub();
      pg->info = msg->info;
-    pg->reg_next_scrub();
+    pg->on_info_history_change();
      pg->dirty_info = true;
      pg->dirty_big_info = true;  // maybe.