// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <string_view>

#include "ScrubStore.h"
#include "scrub_machine_lstnr.h"
#include "scrubber_common.h"
namespace Scrub {

/**
 * Reserving/freeing scrub resources at the replicas.
 *
 * When constructed - sends reservation requests to the acting_set.
 * A rejection triggers a "couldn't acquire the replicas' scrub resources" event.
 * All previous requests, whether already granted or not, are explicitly released.
 *
 * A note re performance: I've measured a few container alternatives for
 * m_reserved_peers, with its specific usage pattern. std::set is extremely slow, as
 * expected. flat_set is only slightly better. Surprisingly - std::vector (with no
 * sorting) is better than boost::small_vec. And for std::vector: no need to pre-reserve.
 */
class ReplicaReservations {
  using OrigSet = decltype(std::declval<PG>().get_actingset());

  std::vector<pg_shard_t> m_waited_for_peers;
  std::vector<pg_shard_t> m_reserved_peers;
  bool m_had_rejections{false};

  void release_replica(pg_shard_t peer, epoch_t epoch);

  void send_all_done();  ///< all reservations are granted

  /// notify the scrubber that we have failed to reserve replicas' resources
  void send_reject();

 public:
  /**
   * quietly discard all knowledge about existing reservations. No messages
   * are sent to the replicas.
   * To be used upon interval change, as we know that the running scrub is no
   * longer relevant, and that the replicas have reset the reservations on
   * their side.
   */
  void discard_all();

  ReplicaReservations(PG* pg, pg_shard_t whoami);

  ~ReplicaReservations();

  void handle_reserve_grant(OpRequestRef op, pg_shard_t from);

  void handle_reserve_reject(OpRequestRef op, pg_shard_t from);
};
/**
 * wraps the local OSD scrub resource reservation in an RAII wrapper
 */
class LocalReservation {
  PG* m_pg;
  OSDService* m_osds;
  bool m_holding_local_reservation{false};

 public:
  LocalReservation(PG* pg, OSDService* osds);
  ~LocalReservation();
  bool is_reserved() const { return m_holding_local_reservation; }
};
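
/*
 * A minimal sketch of the intended RAII usage, assuming the wrapper is held
 * in a std::optional (as PgScrubber does below with m_local_osd_resource):
 *
 *   m_local_osd_resource.emplace(m_pg, m_osds);  // try to reserve locally
 *   if (!m_local_osd_resource->is_reserved()) {
 *     m_local_osd_resource.reset();  // budget exhausted - nothing is held
 *     return false;
 *   }
 *   // ... destruction (or reset()) releases the reservation
 */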
/**
 * wraps the OSD resource we are using when reserved as a replica by a
 * scrubbing Primary.
 */
class ReservedByRemotePrimary {
  bool m_reserved_by_remote_primary{false};
  const epoch_t m_reserved_at;

 public:
  ReservedByRemotePrimary(PG* pg, OSDService* osds, epoch_t epoch);
  ~ReservedByRemotePrimary();
  [[nodiscard]] bool is_reserved() const { return m_reserved_by_remote_primary; }

  /// compare the remembered reserved-at epoch to the current interval
  [[nodiscard]] bool is_stale() const;
};
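
/*
 * A sketch of how a replica might use is_stale() (illustrative; the actual
 * check sites are in the .cc file):
 *
 *   if (m_remote_osd_resource && m_remote_osd_resource->is_stale()) {
 *     // the reservation was granted in a previous interval - discard it
 *     m_remote_osd_resource.reset();
 *   }
 */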
/**
 * Once all replicas' scrub maps are received, we go on to compare the maps.
 * That is - unless we have not yet completed building our own scrub map.
 * MapsCollectionStatus combines the status of waiting for both the local map
 * and the replicas, without resorting to adding dummy entries into a list.
 */
class MapsCollectionStatus {

  bool m_local_map_ready{false};
  std::vector<pg_shard_t> m_maps_awaited_for;

 public:
  [[nodiscard]] bool are_all_maps_available() const
  {
    return m_local_map_ready && m_maps_awaited_for.empty();
  }

  void mark_local_map_ready() { m_local_map_ready = true; }

  void mark_replica_map_request(pg_shard_t from_whom)
  {
    m_maps_awaited_for.push_back(from_whom);
  }

  /// @returns true if indeed waiting for this one. Otherwise: an error string
  auto mark_arriving_map(pg_shard_t from) -> std::tuple<bool, std::string_view>;

  std::vector<pg_shard_t> get_awaited() const { return m_maps_awaited_for; }

  std::string dump() const;

  friend ostream& operator<<(ostream& out, const MapsCollectionStatus& sf);
};

}  // namespace Scrub
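
/*
 * An illustrative flow (hypothetical caller code; the real callers are
 * PgScrubber methods declared below):
 *
 *   m_maps_status.mark_replica_map_request(replica);  // for each replica asked
 *   ...
 *   auto [ok, err] = m_maps_status.mark_arriving_map(from);
 *   if (!ok) {
 *     // 'err' describes the problem, e.g. a map from a shard we never asked
 *   }
 *   m_maps_status.mark_local_map_ready();             // our own map is built
 *   if (m_maps_status.are_all_maps_available()) {
 *     // all maps are in - go compare them
 *   }
 */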
/**
 * the scrub operation flags. Primary only.
 * Set at scrub start. Checked in multiple locations - mostly
 * at scrub end.
 */
struct scrub_flags_t {

  unsigned int priority{0};

  /**
   * set by queue_scrub() if either planned_scrub.auto_repair or
   * need_auto were set.
   * Tested at scrub end.
   */
  bool auto_repair{false};

  /// this flag indicates that we are scrubbing post repair to verify
  /// everything is fixed
  bool check_repair{false};

  /// checked at the end of the scrub, to possibly initiate a deep-scrub
  bool deep_scrub_on_error{false};

  /**
   * scrub must not be aborted.
   * Set for explicitly requested scrubs, and for scrubs originated by the
   * pairing process with the 'repair' flag set (in the RequestScrub event).
   */
  bool required{false};
};

ostream& operator<<(ostream& out, const scrub_flags_t& sf);
/**
 * The part of PG-scrubbing code that isn't state-machine wiring.
 *
 * Why the separation? I wish to move to a different FSM implementation. Thus I
 * am forced to strongly decouple the state-machine implementation details from
 * the actual scrubbing code.
 */
class PgScrubber : public ScrubPgIF, public ScrubMachineListener {
 public:
  explicit PgScrubber(PG* pg);

  // ------------------ the I/F exposed to the PG (ScrubPgIF) -------------

  /// are we waiting for resource reservation grants from our replicas?
  [[nodiscard]] bool is_reserving() const final;

  void initiate_regular_scrub(epoch_t epoch_queued) final;

  void initiate_scrub_after_repair(epoch_t epoch_queued) final;

  void send_scrub_resched(epoch_t epoch_queued) final;

  void active_pushes_notification(epoch_t epoch_queued) final;

  void update_applied_notification(epoch_t epoch_queued) final;

  void send_scrub_unblock(epoch_t epoch_queued) final;

  void digest_update_notification(epoch_t epoch_queued) final;

  void send_replica_maps_ready(epoch_t epoch_queued) final;

  void send_start_replica(epoch_t epoch_queued) final;

  void send_sched_replica(epoch_t epoch_queued) final;

  void send_replica_pushes_upd(epoch_t epoch_queued) final;

  /**
   * we allow some number of preemptions of the scrub, which means we do not
   * block the write but preempt the scrub instead. Then we start to block.
   * Once we start blocking, we do not stop until the scrub range is completed.
   */
  bool write_blocked_by_scrub(const hobject_t& soid) final;
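
  /*
   * An illustrative call site (hypothetical; the real caller is the PG's op
   * handling path):
   *
   *   if (m_scrubber->write_blocked_by_scrub(head_obj)) {
   *     // the scrub could not be preempted - park the op until the chunk is done
   *     waiting_for_scrub.push_back(op);
   *     return;
   *   }
   */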

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t& start, const hobject_t& end) final;

  /**
   * we are a replica being asked by the Primary to reserve OSD resources for
   * scrubbing
   */
  void handle_scrub_reserve_request(OpRequestRef op) final;
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from) final;
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from) final;
  void handle_scrub_reserve_release(OpRequestRef op) final;
  void discard_replica_reservations() final;
  void clear_scrub_reservations() final;  // PG::clear... fwds to here
  void unreserve_replicas() final;

  // managing scrub op registration

  void reg_next_scrub(const requested_scrub_t& request_flags) final;

  void unreg_next_scrub() final;

  void scrub_requested(scrub_level_t scrub_level,
                       scrub_type_t scrub_type,
                       requested_scrub_t& req_flags) final;

  /**
   * Reserve local scrub resources (managed by the OSD)
   *
   * Fails if the OSD's local-scrubs budget was exhausted.
   * \returns were local resources reserved?
   */
  bool reserve_local() final;

  void handle_query_state(ceph::Formatter* f) final;

  void dump(ceph::Formatter* f) const override;

  // used if we are a replica

  void replica_scrub_op(OpRequestRef op) final;

  /// the op priority, taken from the primary's request message
  Scrub::scrub_prio_t replica_op_priority() const final
  {
    return m_replica_request_priority;
  }

  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority,
                                      unsigned int suggested_priority) const final;
  /// the version that refers to m_flags.priority
  unsigned int scrub_requeue_priority(Scrub::scrub_prio_t with_priority) const final;

  void add_callback(Context* context) final { m_callbacks.push_back(context); }

  /// used for an assert in PG.cc
  [[nodiscard]] bool are_callbacks_pending() const final
  {
    return !m_callbacks.empty();
  }

  /// handle a message carrying a replica map
  void map_from_replica(OpRequestRef op) final;

  /**
   * should we requeue blocked ops?
   * Applicable to the PrimaryLogScrub derived class.
   */
  [[nodiscard]] virtual bool should_requeue_blocked_ops(
    eversion_t last_recovery_applied) const override;

  void scrub_clear_state() final;

  /**
   * add to scrub statistics, but only if the soid is below the scrub start
   */
  virtual void stats_of_handled_objects(const object_stat_sum_t& delta_stats,
                                        const hobject_t& soid) override;

  /**
   * finalize the parameters of the initiated scrubbing session:
   *
   * The "current scrub" flags (m_flags) are set from the 'planned_scrub'
   * flag-set; PG_STATE_SCRUBBING, and possibly PG_STATE_DEEP_SCRUB &
   * PG_STATE_REPAIR are set.
   */
  void set_op_parameters(requested_scrub_t& request) final;

  void cleanup_store(ObjectStore::Transaction* t) final;

  bool get_store_errors(const scrub_ls_arg_t& arg,
                        scrub_ls_result_t& res_inout) const override;

  // -------------------------------------------------------------------------
  // the I/F used by the state-machine (i.e. the implementation of
  // ScrubMachineListener)

  bool select_range() final;

  /// walk the log to find the latest update that affects our chunk
  eversion_t search_log_for_updates() const final;

  eversion_t get_last_update_applied() const final
  {
    return m_pg->recovery_state.get_last_update_applied();
  }

  int pending_active_pushes() const final { return m_pg->active_pushes; }

  void scrub_compare_maps() final;

  void on_init() final;
  void on_replica_init() final;
  void replica_handling_done() final;

  /// the version of 'scrub_clear_state()' that does not try to invoke FSM
  /// services (thus can be called from FSM reactions)
  void clear_pgscrub_state() final;

  void add_delayed_scheduling() final;

  /**
   * @returns have we asked at least one replica?
   * 'false' means we are configured with no replicas, and
   * should expect no maps to arrive.
   */
  bool get_replicas_maps(bool replica_can_preempt) final;

  Scrub::FsmNext on_digest_updates() final;

  void send_replica_map(Scrub::PreemptionNoted was_preempted) final;

  void send_remotes_reserved(epoch_t epoch_queued) final;
  void send_reservation_failure(epoch_t epoch_queued) final;

  /**
   * does the PG have newer updates than what we (the scrubber) know?
   */
  [[nodiscard]] bool has_pg_marked_new_updates() const final;

  void set_subset_last_update(eversion_t e) final;

  void maps_compare_n_cleanup() final;

  Scrub::preemption_t& get_preemptor() final;

  int build_primary_map_chunk() final;

  int build_replica_map_chunk() final;

  void reserve_replicas() final;

  [[nodiscard]] bool was_epoch_changed() const final;

  void mark_local_map_ready() final;

  [[nodiscard]] bool are_all_maps_available() const final;

  std::string dump_awaited_maps() const final;

  bool state_test(uint64_t m) const { return m_pg->state_test(m); }
  void state_set(uint64_t m) { m_pg->state_set(m); }
  void state_clear(uint64_t m) { m_pg->state_clear(m); }

  [[nodiscard]] bool is_primary() const { return m_pg->recovery_state.is_primary(); }

  [[nodiscard]] bool is_scrub_registered() const;

  virtual void _scrub_clear_state() {}

  utime_t m_scrub_reg_stamp;  ///< stamp we registered for

  ostream& show(ostream& out) const override;

  // -------------------------------------------------------------------------

  friend ostream& operator<<(ostream& out, const PgScrubber& scrubber);

  static utime_t scrub_must_stamp() { return utime_t(1, 1); }

  virtual ~PgScrubber();  // must be defined separately, in the .cc file

  [[nodiscard]] bool is_scrub_active() const final { return m_active; }

 private:
  void reset_internal_state();

  void requeue_waiting() const { m_pg->requeue_ops(m_pg->waiting_for_scrub); }

  void _scan_snaps(ScrubMap& smap);

  ScrubMap clean_meta_map();

  /**
   * record some parameters of the initiated scrub:
   * - the epoch when started;
   * - the depth of the scrub requested (from the PG_STATE variable)
   */
  void reset_epoch(epoch_t epoch_queued);

  void run_callbacks();

  // ----- methods used to verify the relevance of incoming events:

  /**
   * is the incoming event still relevant, and should be processed?
   *
   * It is not, if:
   * - (1) we are no longer 'actively scrubbing'; or
   * - (2) the message is from an epoch prior to when we started the current
   *   scrub session; or
   * - (3) the message epoch is from a previous interval; or
   * - (4) the 'abort' configuration flags were set.
   *
   * For (1) & (2) - the incoming message is discarded, w/o further action.
   *
   * For (3): (see check_interval() for a full description) if we have not
   * reacted yet to this specific new interval, we do now:
   * - replica reservations are silently discarded (we count on the replicas
   *   to notice the interval change and un-reserve themselves);
   * - the scrubbing is halted.
   *
   * For (4): the message will be discarded, but also:
   * if this is the first time we've noticed the 'abort' request, we perform
   * the abort. (A sketch of the resulting check order follows below.)
   *
   * \returns should the incoming event be processed?
   */
  bool is_message_relevant(epoch_t epoch_to_verify);
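
  /*
   * A sketch of the check order implied by the list above (illustrative only;
   * the actual implementation is in the .cc file and may differ in detail):
   *
   *   bool PgScrubber::is_message_relevant(epoch_t epoch_to_verify)
   *   {
   *     if (!m_active)
   *       return false;                      // (1)
   *     if (epoch_to_verify < m_epoch_start)
   *       return false;                      // (2)
   *     if (!check_interval(epoch_to_verify))
   *       return false;                      // (3) - also halts the scrub
   *     return verify_against_abort(epoch_to_verify);  // (4)
   *   }
   */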

  /**
   * check the 'no scrub' configuration options.
   */
  [[nodiscard]] bool should_abort() const;

  /**
   * Check the 'no scrub' configuration flags.
   *
   * Reset everything if the abort was not handled before.
   * @returns false if the message was discarded due to the abort flag.
   */
  [[nodiscard]] bool verify_against_abort(epoch_t epoch_to_verify);

  [[nodiscard]] bool check_interval(epoch_t epoch_to_verify);

  epoch_t m_last_aborted{};  // the last time we've noticed a request to abort

  /**
   * @returns true if any inconsistency/missing was repaired, false otherwise
   */
  [[nodiscard]] bool scrub_process_inconsistent();

  bool m_needs_sleep{true};  ///< should we sleep before being rescheduled?
                             ///< always 'true', unless we just got out of a
                             ///< sleep period

  utime_t m_sleep_started_at;

  // 'optional', as 'ReplicaReservations' & 'LocalReservation' are
  // 'RAII-designed' to guarantee un-reserving when deleted.
  std::optional<Scrub::ReplicaReservations> m_reservations;
  std::optional<Scrub::LocalReservation> m_local_osd_resource;

  /// the 'remote' resource we, as a replica, grant our Primary when it is
  /// scrubbing
  std::optional<Scrub::ReservedByRemotePrimary> m_remote_osd_resource;

  void cleanup_on_finish();  // scrub_clear_state() as called for a Primary
                             // when Active->NotActive

  /// the part that actually finalizes a scrub
  void scrub_finish();

  /**
   * the derived-class-specific scrub-finishing touches:
   */
  virtual void _scrub_finish() {}

  /**
   * Validate consistency of the object info and snap sets.
   */
  virtual void scrub_snapshot_metadata(ScrubMap& map,
                                       const missing_map_t& missing_digest)
  {}

  // common code used by build_primary_map_chunk() and build_replica_map_chunk():
  int build_scrub_map_chunk(ScrubMap& map,  // primary or replica?
                            ScrubMapBuilder& pos,
                            hobject_t start,
                            hobject_t end,
                            bool deep);

  std::unique_ptr<Scrub::ScrubMachine> m_fsm;
  const spg_t m_pg_id;  ///< a local copy of m_pg->pg_id
  OSDService* const m_osds;
  const pg_shard_t m_pg_whoami;  ///< a local copy of m_pg->pg_whoami

  epoch_t m_interval_start{0};  ///< the interval's 'from': when the scrubbing
                                ///< was first scheduled

  /**
   * the exact epoch when the scrubbing actually started ('started' here
   * meaning: the no-scrub configuration checks were cleared). Incoming events
   * are verified against this epoch, with stale events discarded.
   */
  epoch_t m_epoch_start{0};  ///< the actual epoch when scrubbing started

  scrub_flags_t m_flags;

  bool m_active{false};

  eversion_t m_subset_last_update{};

  std::unique_ptr<Scrub::Store> m_store;

  int num_digest_updates_pending{0};

  hobject_t m_start, m_end;  ///< note: half-closed: [start,end)
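
  /*
   * note on the half-closed range: an object X belongs to the current chunk
   * iff m_start <= X < m_end, so consecutive chunks [a,b) and [b,c) never
   * overlap and never leave a gap.
   */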

  /// Returns reference to current osdmap
  const OSDMapRef& get_osdmap() const;

  /// Returns epoch of current osdmap
  epoch_t get_osdmap_epoch() const { return get_osdmap()->get_epoch(); }

  CephContext* get_pg_cct() const { return m_pg->cct; }

  // collected statistics
  int m_shallow_errors{0};
  int m_deep_errors{0};
  int m_fixed_count{0};

  /// Maps from objects with errors to missing peers
  HobjToShardSetMapping m_missing;

  /**
   * 'm_is_deep' - is the running scrub a deep one?
   *
   * Note that most of the code directly checks PG_STATE_DEEP_SCRUB, which is
   * primary-only (and is set earlier - when scheduling the scrub). 'm_is_deep'
   * is meaningful both for the primary and the replicas, and is used as a
   * parameter when building the scrub maps.
   */
  bool m_is_deep{false};

  /**
   * If set: affects the backend & scrubber-backend functions called after all
   * scrub maps are available.
   *
   * Replaces code that directly checks PG_STATE_REPAIR (which was meant to be
   * a "user facing" status display only).
   */
  bool m_is_repair{false};

  /**
   * User-readable summary of the scrubber's current mode of operation. Used
   * for both osd.*.log and the cluster log.
   *
   * Note: based on PG_STATE_REPAIR, and not on m_is_repair. I.e. an
   * auto_repair scrub will show as "deep-scrub" and not as "repair" (until the
   * first error is detected).
   */
  std::string_view m_mode_desc;

  void update_op_mode_text();

  /**
   * initiate a deep-scrub after the current scrub ended with errors.
   */
  void request_rescrubbing(requested_scrub_t& req_flags);

  std::list<Context*> m_callbacks;

  /**
   * send a replica (un)reservation request to the acting set
   *
   * @param opcode - one of MOSDScrubReserve::REQUEST
   *                 or MOSDScrubReserve::RELEASE
   */
  void message_all_replicas(int32_t opcode, std::string_view op_text);
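
  /*
   * Illustrative calls (hypothetical - the 'op_text' strings are made up here;
   * the real call sites are in the .cc file):
   *
   *   message_all_replicas(MOSDScrubReserve::REQUEST, "request scrub-resources");
   *   message_all_replicas(MOSDScrubReserve::RELEASE, "release scrub-resources");
   */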

  hobject_t m_max_end;  ///< Largest end that may have been sent to replicas
  ScrubMap m_primary_scrubmap;
  ScrubMapBuilder m_primary_scrubmap_pos;

  std::map<pg_shard_t, ScrubMap> m_received_maps;

  /// a cleaned-up copy of the map, awaiting the snap-metadata scrub
  ScrubMap m_cleaned_meta_map;

  void _request_scrub_map(pg_shard_t replica,
                          eversion_t version,
                          hobject_t start,
                          hobject_t end,
                          bool deep,
                          bool allow_preemption);

  Scrub::MapsCollectionStatus m_maps_status;

  omap_stat_t m_omap_stats = (const struct omap_stat_t){0};

  /// Maps from objects with errors to inconsistent peers
  HobjToShardSetMapping m_inconsistent;

  /// Maps from objects with errors to good peers
  std::map<hobject_t, std::list<std::pair<ScrubMap::object, pg_shard_t>>>
    m_authoritative;

  // ------------ members used if we are a replica

  epoch_t m_replica_min_epoch;  ///< the min epoch needed to handle this message

  ScrubMapBuilder replica_scrubmap_pos;
  ScrubMap replica_scrubmap;

  /**
   * the priority of the request is noted as it arrives. It influences the
   * queuing priority when we wait for local updates.
   */
  Scrub::scrub_prio_t m_replica_request_priority;

  /**
   * Queue an event to be sent to the replica, to trigger a re-check of the
   * availability of the scrub map prepared by the backend.
   */
  void requeue_replica(Scrub::scrub_prio_t is_high_priority);

  /**
   * the 'preemption' "state-machine".
   * Note: I was considering an orthogonal sub-machine implementation, but as
   * the state diagram is extremely simple, the added complexity wasn't
   * justified.
   */
  class preemption_data_t : public Scrub::preemption_t {
   public:
    preemption_data_t(PG* pg);  // the PG access is used for conf access (and logs)

    [[nodiscard]] bool is_preemptable() const final { return m_preemptable; }

    bool do_preempt() final
    {
      if (m_preempted || !m_preemptable)
        return false;

      std::lock_guard<std::mutex> lk{m_preemption_lock};
      if (!m_preemptable)
        return false;  // re-checked under the lock

      m_preempted = true;
      return true;
    }

    /// same as 'do_preempt()' but w/o checks (as once a replica
    /// was preempted, we cannot continue)
    void replica_preempted() { m_preempted = true; }

    void enable_preemption()
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};
      if (are_preemptions_left() && !m_preempted) {
        m_preemptable = true;
      }
    }

    /// used by a replica to set preemptability state according to the
    /// Primary's request
    void force_preemptability(bool is_allowed)
    {
      // note: no need to lock for a replica
      m_preemptable = is_allowed;
    }

    bool disable_and_test() final
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};
      m_preemptable = false;
      return m_preempted;
    }

    [[nodiscard]] bool was_preempted() const { return m_preempted; }

    [[nodiscard]] size_t chunk_divisor() const { return m_size_divisor; }

    void adjust_parameters() final
    {
      std::lock_guard<std::mutex> lk{m_preemption_lock};

      if (m_preempted) {
        m_preempted = false;
        m_preemptable = adjust_left();
      } else {
        m_preemptable = are_preemptions_left();
      }
    }

   private:
    mutable std::mutex m_preemption_lock;
    bool m_preemptable{false};
    bool m_preempted{false};
    int m_left;
    size_t m_size_divisor{1};

    bool are_preemptions_left() const { return m_left > 0; }

    /// each preemption consumed lowers the number of preemptions still
    /// allowed, and doubles the chunk-size divisor
    bool adjust_left()
    {
      m_size_divisor *= 2;
      return --m_left > 0;
    }
  };

  preemption_data_t preemption_data;
};