update sources to v12.2.3
ceph/src/osd/PG.cc
index 139a9ac6d17941d835bfecbea14b2e2577d3f072..85f402df7473e6205388d2c36d8daa4f62b036b8 100644
@@ -1833,14 +1833,13 @@ void PG::activate(ObjectStore::Transaction& t,
 
       build_might_have_unfound();
 
-      state_set(PG_STATE_DEGRADED);
       if (have_unfound())
        discover_all_missing(query_map);
     }
 
-    // degraded?
+    // If calculated, num_objects_degraded should reflect this too, unless
+    // nothing is missing and we are about to go clean.
     if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
-      state_set(PG_STATE_DEGRADED);
       state_set(PG_STATE_UNDERSIZED);
     }
 
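
The accounting the comment above refers to plays out in _update_calc_stats() further down: an undersized PG counts every object on each absent shard as degraded, even when nothing is missing on the surviving replicas. A minimal standalone sketch of that arithmetic (illustrative numbers, not Ceph code):

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t num_objects = 1000;  // objects in the PG
      const unsigned pool_size  = 3;     // replicated pool size
      const unsigned up_shards  = 2;     // OSDs currently in the up set (undersized)

      // Up shards with nothing missing contribute 0; each placeholder added
      // for an absent shard contributes num_objects.
      int64_t degraded =
          static_cast<int64_t>(pool_size - up_shards) * num_objects;

      std::cout << "num_objects_degraded = " << degraded << '\n';  // 1000
    }
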
@@ -1932,6 +1931,12 @@ void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
     // waiters
     if (flushes_in_progress == 0) {
       requeue_ops(waiting_for_peered);
+    } else if (!waiting_for_peered.empty()) {
+      dout(10) << __func__ << " flushes in progress, moving "
+              << waiting_for_peered.size() << " items to waiting_for_flush"
+              << dendl;
+      assert(waiting_for_flush.empty());
+      waiting_for_flush.swap(waiting_for_peered);
     }
   }
 
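
A minimal sketch of the gating pattern this hunk introduces (standalone; the types and the on_flushed() hook are illustrative stand-ins, not Ceph's): ops waiting for the PG to become peered are parked on a second queue while a flush is still in flight, and are only requeued once the flush count drops to zero.

    #include <cassert>
    #include <deque>
    #include <functional>
    #include <iostream>

    struct PeeredGate {
      unsigned flushes_in_progress = 0;
      std::deque<std::function<void()>> waiting_for_peered;
      std::deque<std::function<void()>> waiting_for_flush;

      static void requeue(std::deque<std::function<void()>>& q) {
        while (!q.empty()) { q.front()(); q.pop_front(); }
      }

      // Mirrors the new branch above: requeue immediately if no flush is
      // pending, otherwise hand the waiters over to waiting_for_flush.
      void on_activate_committed() {
        if (flushes_in_progress == 0) {
          requeue(waiting_for_peered);
        } else if (!waiting_for_peered.empty()) {
          assert(waiting_for_flush.empty());
          waiting_for_flush.swap(waiting_for_peered);
        }
      }

      // Hypothetical hook for when the outstanding flush completes.
      void on_flushed() {
        if (--flushes_in_progress == 0)
          requeue(waiting_for_flush);
      }
    };

    int main() {
      PeeredGate g;
      g.flushes_in_progress = 1;
      g.waiting_for_peered.push_back([] { std::cout << "op requeued\n"; });
      g.on_activate_committed();  // parked on waiting_for_flush, nothing runs yet
      g.on_flushed();             // flush done: the parked op runs now
    }
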
@@ -1953,6 +1958,14 @@ void PG::all_activated_and_committed()
   assert(!actingbackfill.empty());
   assert(blocked_by.empty());
 
+  // Degraded?
+  _update_calc_stats();
+  if (info.stats.stats.sum.num_objects_degraded) {
+    state_set(PG_STATE_DEGRADED);
+  } else {
+    state_clear(PG_STATE_DEGRADED);
+  }
+
   queue_peering_event(
     CephPeeringEvtRef(
       std::make_shared<CephPeeringEvt>(
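
The same derive-from-stats rule reappears in publish_stats_to_osd() below; in both places PG_STATE_DEGRADED now follows the recomputed counter instead of being set eagerly during activation or on map changes. A compact standalone sketch of the rule (the bit value and names are illustrative):

    #include <cstdint>

    constexpr uint64_t PG_STATE_DEGRADED_BIT = 1ull << 0;  // illustrative value

    // Apply the degraded bit purely as a function of num_objects_degraded.
    inline uint64_t apply_degraded_bit(uint64_t state, int64_t num_objects_degraded) {
      return num_objects_degraded ? (state | PG_STATE_DEGRADED_BIT)
                                  : (state & ~PG_STATE_DEGRADED_BIT);
    }
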
@@ -2566,99 +2579,150 @@ void PG::_update_calc_stats()
   info.stats.ondisk_log_size = info.stats.log_size;
   info.stats.log_start = pg_log.get_tail();
   info.stats.ondisk_log_start = pg_log.get_tail();
+  info.stats.snaptrimq_len = snap_trimq.size();
 
-  // If actingset is larger then upset we will have misplaced,
-  // so we will report based on actingset size.
-
-  // If upset is larger then we will have degraded,
-  // so we will report based on upset size.
+  unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
 
-  // If target is the largest of them all, it will contribute to
-  // the degraded count because num_object_copies is
-  // computed using target and eventual used to get degraded total.
-
-  unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
-  unsigned nrep = MAX(actingset.size(), upset.size());
+  // In the rare case that upset is too large (usually transient), use it as
+  // the target for the calculations below.
+  unsigned target = std::max(num_shards, (unsigned)upset.size());
+  // It is unclear whether actingset could ever be larger than upset,
+  // which only matters if actingset > num_shards.
+  unsigned nrep = std::max(actingset.size(), upset.size());
   // calc num_object_copies
   info.stats.stats.calc_copies(MAX(target, nrep));
   info.stats.stats.sum.num_objects_degraded = 0;
   info.stats.stats.sum.num_objects_unfound = 0;
   info.stats.stats.sum.num_objects_misplaced = 0;
-  if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
-    // NOTE: we only generate copies, degraded, misplaced and unfound
+  if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
+    dout(20) << __func__ << " actingset " << actingset << " upset "
+             << upset << " actingbackfill " << actingbackfill << dendl;
+    dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
+
+    assert(!actingbackfill.empty());
+
+    // NOTE: we only generate degraded, misplaced and unfound
     // values for the summation, not individual stat categories.
     int64_t num_objects = info.stats.stats.sum.num_objects;
 
-    // Total sum of all missing
-    int64_t missing = 0;
-    // Objects that have arrived backfilled to up OSDs (not in acting)
-    int64_t backfilled = 0;
-    // A misplaced object is not stored on the correct OSD
-    int64_t misplaced = 0;
-    // Total of object copies/shards found
-    int64_t object_copies = 0;
-
-    // num_objects_missing on each peer
-    for (map<pg_shard_t, pg_info_t>::iterator pi =
-        peer_info.begin();
-        pi != peer_info.end();
-        ++pi) {
-      map<pg_shard_t, pg_missing_t>::const_iterator pm =
-        peer_missing.find(pi->first);
-      if (pm != peer_missing.end()) {
-        pi->second.stats.stats.sum.num_objects_missing =
-          pm->second.num_missing();
-      }
+    // Objects missing from up nodes, sorted by # objects.
+    boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
+    // Objects missing from nodes not in up, sorted by # objects.
+    boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
+
+    int64_t missing;
+
+    // Primary first
+    missing = pg_log.get_missing().num_missing();
+    assert(actingbackfill.count(pg_whoami));
+    if (upset.count(pg_whoami)) {
+      missing_target_objects.insert(make_pair(missing, pg_whoami));
+    } else {
+      acting_source_objects.insert(make_pair(missing, pg_whoami));
     }
+    info.stats.stats.sum.num_objects_missing_on_primary = missing;
 
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
-        ++i) {
-      const pg_shard_t &p = *i;
-
-      bool in_up = (upset.find(p) != upset.end());
-      bool in_acting = (actingset.find(p) != actingset.end());
-      assert(in_up || in_acting);
-
-      // in acting                  Compute total objects excluding num_missing
-      // in acting and not in up    Compute misplaced objects excluding num_missing
-      // in up and not in acting    Compute total objects already backfilled
-      if (in_acting) {
-        unsigned osd_missing;
-        // primary handling
-        if (p == pg_whoami) {
-          osd_missing = pg_log.get_missing().num_missing();
-          info.stats.stats.sum.num_objects_missing_on_primary =
-              osd_missing;
-          object_copies += num_objects; // My local (primary) count
-        } else {
-          assert(peer_missing.count(p));
-          osd_missing = peer_missing[p].num_missing();
-          object_copies += peer_info[p].stats.stats.sum.num_objects;
+    // All other peers
+    for (auto& peer : peer_info) {
+      // Ignore other peers until we add code to look at detailed missing
+      // information. (recovery)
+      if (!actingbackfill.count(peer.first)) {
+       continue;
+      }
+      missing = 0;
+      // Backfill targets always track num_objects accurately;
+      // all other peers track missing accurately.
+      if (is_backfill_targets(peer.first)) {
+       missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
+      } else {
+       if (peer_missing.count(peer.first)) {
+         missing = peer_missing[peer.first].num_missing();
+       } else {
+         dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
         }
-        missing += osd_missing;
-        // Count non-missing objects not in up as misplaced
-        if (!in_up && num_objects > osd_missing)
-         misplaced += num_objects - osd_missing;
+      }
+      if (upset.count(peer.first)) {
+       missing_target_objects.insert(make_pair(missing, peer.first));
       } else {
-        assert(in_up && !in_acting);
+       acting_source_objects.insert(make_pair(missing, peer.first));
+      }
+      peer.second.stats.stats.sum.num_objects_missing = missing;
+    }
 
-        // If this peer has more objects then it should, ignore them
-        backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
+    if (pool.info.is_replicated()) {
+      // Pad missing_target_objects out to target elements, each missing num_objects.
+      assert(target >= missing_target_objects.size());
+      unsigned needed = target - missing_target_objects.size();
+      for (; needed; --needed)
+       missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
+    } else {
+      for (unsigned i = 0 ; i < num_shards; ++i) {
+        shard_id_t shard(i);
+       bool found = false;
+       for (const auto& t : missing_target_objects) {
+         if (std::get<1>(t).shard == shard) {
+           found = true;
+           break;
+         }
+       }
+       if (!found)
+         missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
       }
     }
 
-    // Any objects that have been backfilled to up OSDs can deducted from misplaced
-    misplaced = MAX(0, misplaced - backfilled);
+    for (const auto& item : missing_target_objects)
+      dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
+    for (const auto& item : acting_source_objects)
+      dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
+
+    // A misplaced object is not stored on the correct OSD
+    int64_t misplaced = 0;
+    // A degraded object has fewer replicas or EC shards than the pool specifies.
+    int64_t degraded = 0;
+
+    for (auto m = missing_target_objects.rbegin();
+        m != missing_target_objects.rend(); ++m) {
 
-    // Deduct computed total missing on acting nodes
-    object_copies -= missing;
-    // Include computed backfilled objects on up nodes
-    object_copies += backfilled;
-    // a degraded objects has fewer replicas or EC shards than the
-    // pool specifies.  num_object_copies will never be smaller than target * num_copies.
-    int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
+      int64_t extra_missing = -1;
+
+      if (pool.info.is_replicated()) {
+       if (!acting_source_objects.empty()) {
+         auto extra_copy = acting_source_objects.begin();
+         extra_missing = std::get<0>(*extra_copy);
+          acting_source_objects.erase(extra_copy);
+       }
+      } else { // Erasure coded
+       // Use corresponding shard
+       for (const auto& a : acting_source_objects) {
+         if (std::get<1>(a).shard == std::get<1>(*m).shard) {
+           extra_missing = std::get<0>(a);
+           acting_source_objects.erase(a);
+           break;
+         }
+       }
+      }
+
+      if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
+       // We don't know which of the objects on the target
+       // are part of extra_missing, so assume they are all degraded.
+       misplaced += std::get<0>(*m) - extra_missing;
+       degraded += extra_missing;
+      } else {
+       // 1. extra_missing == -1: more targets than sources, so degraded
+       // 2. extra_missing > std::get<0>(*m): some of the extra_missing objects
+       //    that were previously degraded are now present on the target.
+       degraded += std::get<0>(*m);
+      }
+    }
+    // If there are still acting shards that haven't been accounted for,
+    // then the objects they hold are misplaced.
+    for (const auto& a : acting_source_objects) {
+      int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
+      dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
+      misplaced += extra_misplaced;
+    }
+    dout(20) << __func__ << " degraded " << degraded << dendl;
+    dout(20) << __func__ << " misplaced " << misplaced << dendl;
 
     info.stats.stats.sum.num_objects_degraded = degraded;
     info.stats.stats.sum.num_objects_unfound = get_num_unfound();
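
A condensed, replicated-pool-only re-implementation of the pairing logic above, with illustrative numbers rather than Ceph's types (std::set stands in for the boost flat_set): up = [0,1,2] and acting = [0,1,3], osd.2 is a backfill target still missing 60 of the 100 objects, and osd.3 holds an extra copy but is itself missing 10 objects. The 10 objects short of a full complement come out degraded; the remaining 50, present on osd.3 but not yet on the correct OSD, come out misplaced.

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <set>
    #include <utility>

    int main() {
      const int64_t num_objects = 100;

      // (missing, shard) for shards in the up set, including the backfill target.
      std::set<std::pair<int64_t, int>> missing_target_objects = {
          {0, 0}, {0, 1}, {60, 2}};
      // (missing, shard) for acting shards that are not in up.
      std::set<std::pair<int64_t, int>> acting_source_objects = {{10, 3}};

      int64_t degraded = 0, misplaced = 0;

      // Largest missing target first, paired with the least-missing source.
      for (auto m = missing_target_objects.rbegin();
           m != missing_target_objects.rend(); ++m) {
        int64_t extra_missing = -1;
        if (!acting_source_objects.empty()) {
          auto best = acting_source_objects.begin();
          extra_missing = best->first;
          acting_source_objects.erase(best);
        }
        if (extra_missing >= 0 && m->first >= extra_missing) {
          misplaced += m->first - extra_missing;  // copies exist, wrong OSD
          degraded += extra_missing;              // short of a full complement
        } else {
          degraded += m->first;                   // no source left to cover this
        }
      }
      // Acting shards never paired with a target only hold misplaced copies.
      for (const auto& a : acting_source_objects)
        misplaced += std::max<int64_t>(0, num_objects - a.first);

      std::cout << "degraded=" << degraded << " misplaced=" << misplaced << '\n';
      // prints: degraded=10 misplaced=50
    }
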
@@ -2722,6 +2786,11 @@ void PG::publish_stats_to_osd()
   }
 
   _update_calc_stats();
+  if (info.stats.stats.sum.num_objects_degraded) {
+    state_set(PG_STATE_DEGRADED);
+  } else {
+    state_clear(PG_STATE_DEGRADED);
+  }
   _update_blocked_by();
 
   bool publish = false;
@@ -3338,7 +3407,7 @@ void PG::read_state(ObjectStore *store, bufferlist &bl)
     cct->_conf->osd_ignore_stale_divergent_priors,
     cct->_conf->osd_debug_verify_missing_on_start);
   if (oss.tellp())
-    osd->clog->error() << oss.rdbuf();
+    osd->clog->error() << oss.str();
 
   if (force_rebuild_missing) {
     dout(10) << __func__ << " forced rebuild of missing got "
@@ -5582,6 +5651,8 @@ ostream& operator<<(ostream& out, const PG& pg)
       << " " << pg.up;
   if (pg.acting != pg.up)
     out << "/" << pg.acting;
+  if (pg.is_ec_pg())
+    out << "p" << pg.get_primary();
   out << " r=" << pg.get_role();
   out << " lpr=" << pg.get_last_peering_reset();
 
@@ -5943,6 +6014,10 @@ void PG::handle_create(RecoveryCtx *rctx)
   recovery_state.handle_event(evt, rctx);
   ActMap evt2;
   recovery_state.handle_event(evt2, rctx);
+
+  rctx->on_applied->add(make_lambda_context([this]() {
+    update_store_with_options();
+  }));
 }
 
 void PG::handle_query_state(Formatter *f)
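
A minimal standalone sketch of the deferral pattern used in handle_create() above (generic C++; OnAppliedList is an illustrative stand-in for the RecoveryCtx on_applied context list): rather than applying the collection options inline, the call is queued and only runs once the creating transaction has been applied.

    #include <functional>
    #include <iostream>
    #include <utility>
    #include <vector>

    struct OnAppliedList {
      std::vector<std::function<void()>> callbacks;
      void add(std::function<void()> cb) { callbacks.push_back(std::move(cb)); }
      void complete() {                  // invoked once the transaction applies
        for (auto& cb : callbacks) cb();
        callbacks.clear();
      }
    };

    int main() {
      OnAppliedList on_applied;
      // Queue the work instead of doing it inline, mirroring the lambda above.
      on_applied.add([] { std::cout << "update_store_with_options()\n"; });
      // ... transaction queued and applied by the object store ...
      on_applied.complete();             // options applied only now
    }
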
@@ -5956,7 +6031,7 @@ void PG::update_store_with_options()
 {
   auto r = osd->store->set_collection_opts(coll, pool.info.opts);
   if(r < 0 && r != -EOPNOTSUPP) {
-    derr << __func__ << "set_collection_opts returns error:" << r << dendl;
+    derr << __func__ << " set_collection_opts returns error:" << r << dendl;
   }
 }
 
@@ -6433,6 +6508,37 @@ PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
   return transit<NotBackfilling>();
 }
 
+boost::statechart::result
+PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+
+  pg->state_set(PG_STATE_BACKFILL_UNFOUND);
+  pg->state_clear(PG_STATE_BACKFILLING);
+
+  for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
+       it != pg->backfill_targets.end();
+       ++it) {
+    assert(*it != pg->pg_whoami);
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      it->osd, pg->get_osdmap()->get_epoch());
+    if (con) {
+      pg->osd->send_message_osd_cluster(
+        new MBackfillReserve(
+         MBackfillReserve::REJECT,
+         spg_t(pg->info.pgid.pgid, it->shard),
+         pg->get_osdmap()->get_epoch()),
+       con.get());
+    }
+  }
+
+  pg->waiting_on_backfill.clear();
+
+  return transit<NotBackfilling>();
+}
+
 boost::statechart::result
 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
 {
@@ -6614,6 +6720,7 @@ void PG::RecoveryState::NotBackfilling::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
+  pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
   utime_t dur = ceph_clock_now() - enter_time;
   pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
 }
@@ -6632,6 +6739,7 @@ void PG::RecoveryState::NotRecovering::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
   PG *pg = context< RecoveryMachine >().pg;
+  pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
   utime_t dur = ceph_clock_now() - enter_time;
   pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
 }
@@ -6944,6 +7052,7 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx)
   pg->state_clear(PG_STATE_RECOVERY_WAIT);
   pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
   pg->state_set(PG_STATE_RECOVERING);
+  assert(!pg->state_test(PG_STATE_ACTIVATING));
   pg->publish_stats_to_osd();
   pg->queue_recovery();
 }
@@ -6992,6 +7101,8 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
   pg->state_clear(PG_STATE_FORCED_RECOVERY);
   release_reservations();
   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  // XXX: Is this needed?
+  pg->publish_stats_to_osd();
   return transit<WaitLocalBackfillReserved>();
 }
 
@@ -7008,6 +7119,18 @@ PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
   return transit<NotRecovering>();
 }
 
+boost::statechart::result
+PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
+  pg->state_set(PG_STATE_RECOVERY_UNFOUND);
+  pg->state_clear(PG_STATE_RECOVERING);
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  release_reservations(true);
+  return transit<NotRecovering>();
+}
+
 void PG::RecoveryState::Recovering::exit()
 {
   context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -7033,7 +7156,6 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
   assert(!pg->actingbackfill.empty());
   if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
       pg->actingbackfill.size()) {
-    pg->state_clear(PG_STATE_DEGRADED);
     pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
     pg->publish_stats_to_osd();
   }
@@ -7178,16 +7300,11 @@ boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
       pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
     if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
       pg->state_clear(PG_STATE_UNDERSIZED);
-      if (pg->needs_recovery()) {
-       pg->state_set(PG_STATE_DEGRADED);
-      } else {
-       pg->state_clear(PG_STATE_DEGRADED);
-      }
     } else {
       pg->state_set(PG_STATE_UNDERSIZED);
-      pg->state_set(PG_STATE_DEGRADED);
     }
-    need_publish = true; // degraded may have changed
+    // degraded changes will be detected by the call to publish_stats_to_osd()
+    need_publish = true;
   }
 
   // if we haven't reported our PG stats in a long time, do so now.
@@ -7303,9 +7420,10 @@ boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt
     pg->peer_missing[logevt.from],
     logevt.from,
     context< RecoveryMachine >().get_recovery_ctx());
-  if (pg->is_peered() &&
-      got_missing)
-    pg->queue_recovery();
+  // If there are missing objects AND we are "fully" active, then start recovery now
+  if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
+    post_event(DoRecovery());
+  }
   return discard_event();
 }
 
@@ -7396,6 +7514,13 @@ boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActi
   // waiters
   if (pg->flushes_in_progress == 0) {
     pg->requeue_ops(pg->waiting_for_peered);
+  } else if (!pg->waiting_for_peered.empty()) {
+    ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
+                      << pg->waiting_for_peered.size()
+                      << " items to waiting_for_flush"
+                      << dendl;
+    assert(pg->waiting_for_flush.empty());
+    pg->waiting_for_flush.swap(pg->waiting_for_peered);
   }
 
   pg->on_activate();