/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*======
This file is part of PerconaFT.


Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License, version 2,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

    PerconaFT is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License, version 3,
    as published by the Free Software Foundation.

    PerconaFT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with PerconaFT. If not, see <http://www.gnu.org/licenses/>.

----------------------------------------

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
======= */

#ident \
    "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
#pragma once

#include <atomic>

#include "../db.h"
#include "../ft/comparator.h"
#include "../portability/toku_external_pthread.h"
#include "../portability/toku_pthread.h"
#include "../portability/toku_time.h"
// PORT #include <ft/ft-ops.h> // just for DICTIONARY_ID..
// PORT: ft-status for LTM_STATUS:
#include "../ft/ft-status.h"

struct DICTIONARY_ID {
  uint64_t dictid;
};

#include "../util/omt.h"
#include "range_buffer.h"
#include "txnid_set.h"
#include "wfg.h"
namespace toku {

class locktree;
class locktree_manager;
class lock_request;
class concurrent_tree;

typedef int (*lt_create_cb)(locktree *lt, void *extra);
typedef void (*lt_destroy_cb)(locktree *lt);
typedef void (*lt_escalate_cb)(TXNID txnid, const locktree *lt,
                               const range_buffer &buffer, void *extra);

typedef bool (*lt_escalation_barrier_check_func)(const DBT *a, const DBT *b,
                                                 void *extra);

struct lt_counters {
  uint64_t wait_count, wait_time;
  uint64_t long_wait_count, long_wait_time;
  uint64_t timeout_count;

  void add(const lt_counters &rhs) {
    wait_count += rhs.wait_count;
    wait_time += rhs.wait_time;
    long_wait_count += rhs.long_wait_count;
    long_wait_time += rhs.long_wait_time;
    timeout_count += rhs.timeout_count;
  }
};
104
105 // Lock request state for some locktree
106 struct lt_lock_request_info {
107 omt<lock_request *> pending_lock_requests;
108 std::atomic_bool pending_is_empty;
109 toku_external_mutex_t mutex;
110 bool should_retry_lock_requests;
111 lt_counters counters;
112 std::atomic_ullong retry_want;
113 unsigned long long retry_done;
114 toku_mutex_t retry_mutex;
115 toku_cond_t retry_cv;
116 bool running_retry;
117
118 void init(toku_external_mutex_factory_t mutex_factory);
119 void destroy(void);
120 };
121
// The locktree manager manages a set of locktrees, one for each open
// dictionary. Locktrees are retrieved from the manager. When they are no
// longer needed, they are released by the user.
class locktree_manager {
 public:
  // param: create_cb, called just after a locktree is first created.
  //        destroy_cb, called just before a locktree is destroyed.
  //        escalate_cb, called after a locktree is escalated (with extra
  //        param)
  void create(lt_create_cb create_cb, lt_destroy_cb destroy_cb,
              lt_escalate_cb escalate_cb, void *extra,
              toku_external_mutex_factory_t mutex_factory_arg);

  void destroy(void);

  size_t get_max_lock_memory(void);

  int set_max_lock_memory(size_t max_lock_memory);

  // effect: Get a locktree from the manager. If a locktree exists with the
  //         given dict_id, it is referenced and then returned. If one does
  //         not exist, it is created. It will use the comparator for
  //         comparing keys. The on_create callback (passed to
  //         locktree_manager::create()) will be called with the given extra
  //         parameter.
  locktree *get_lt(DICTIONARY_ID dict_id, const comparator &cmp,
                   void *on_create_extra);
  void reference_lt(locktree *lt);

  // effect: Releases one reference on a locktree. If the reference count
  //         transitions to zero, the on_destroy callback is called before it
  //         gets destroyed.
  void release_lt(locktree *lt);

  void get_status(LTM_STATUS status);

  // effect: calls the iterate function on each pending lock request
  // note: holds the manager's mutex
  typedef int (*lock_request_iterate_callback)(DICTIONARY_ID dict_id,
                                               TXNID txnid,
                                               const DBT *left_key,
                                               const DBT *right_key,
                                               TXNID blocking_txnid,
                                               uint64_t start_time,
                                               void *extra);
  int iterate_pending_lock_requests(lock_request_iterate_callback cb,
                                    void *extra);
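
  // A usage sketch for the iterator above (hypothetical caller code;
  // `dump_one` and the stop-on-nonzero convention are assumptions for
  // illustration, not part of this header's contract):
  //
  //   static int dump_one(DICTIONARY_ID dict_id, TXNID txnid,
  //                       const DBT *left_key, const DBT *right_key,
  //                       TXNID blocking_txnid, uint64_t start_time,
  //                       void *extra) {
  //     // inspect one pending request; a nonzero return is assumed to
  //     // abort the iteration, as is conventional for omt iterators
  //     return 0;
  //   }
  //   ...
  //   mgr->iterate_pending_lock_requests(dump_one, nullptr);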

  // effect: Determines if too many locks or too much memory is being used,
  //         and runs escalation on the manager if so.
  // param: big_txn, true if the current transaction is 'big' (has spilled
  //        rollback logs)
  // returns: 0 if there are enough resources to create a new lock, or
  //          TOKUDB_OUT_OF_LOCKS if there are not enough resources and lock
  //          escalation failed to free up enough resources for a new lock.
  int check_current_lock_constraints(bool big_txn);
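
  // Caller sketch (an assumption about typical use, not code from this
  // library): run the resource check before creating a new lock, and
  // surface TOKUDB_OUT_OF_LOCKS to the client if escalation cannot help.
  //
  //   int r = mgr->check_current_lock_constraints(big_txn);
  //   if (r == TOKUDB_OUT_OF_LOCKS) return r;  // escalation freed nothing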

  bool over_big_threshold(void);

  void note_mem_used(uint64_t mem_used);

  void note_mem_released(uint64_t mem_freed);

  bool out_of_locks(void) const;

  // Escalate all locktrees
  void escalate_all_locktrees(void);

  // Escalate a set of locktrees
  void escalate_locktrees(locktree **locktrees, int num_locktrees);

  // effect: calls the private function run_escalation(), only ok to
  //         do for tests.
  // rationale: to get better stress test coverage, we want a way to
  //            deterministically trigger lock escalation.
  void run_escalation_for_test(void);
  void run_escalation(void);

  // Add time t to the escalator's wait time statistics
  void add_escalator_wait_time(uint64_t t);

  void kill_waiter(void *extra);

 private:
  static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;

  // tracks the current number of locks and lock memory
  uint64_t m_max_lock_memory;
  uint64_t m_current_lock_memory;

  struct lt_counters m_lt_counters;

  // the create and destroy callbacks for the locktrees
  lt_create_cb m_lt_create_callback;
  lt_destroy_cb m_lt_destroy_callback;
  lt_escalate_cb m_lt_escalate_callback;
  void *m_lt_escalate_callback_extra;

  omt<locktree *> m_locktree_map;

  toku_external_mutex_factory_t mutex_factory;

  // the manager's mutex protects the locktree map
  toku_mutex_t m_mutex;

  void mutex_lock(void);

  void mutex_unlock(void);

  // Manage the set of open locktrees
  locktree *locktree_map_find(const DICTIONARY_ID &dict_id);
  void locktree_map_put(locktree *lt);
  void locktree_map_remove(locktree *lt);

  static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);

  void escalator_init(void);
  void escalator_destroy(void);

  // statistics about lock escalation.
  toku_mutex_t m_escalation_mutex;
  uint64_t m_escalation_count;
  tokutime_t m_escalation_time;
  uint64_t m_escalation_latest_result;
  uint64_t m_wait_escalation_count;
  uint64_t m_wait_escalation_time;
  uint64_t m_long_wait_escalation_count;
  uint64_t m_long_wait_escalation_time;

  // the escalator coordinates escalation on a set of locktrees for a bunch of
  // threads
  class locktree_escalator {
   public:
    void create(void);
    void destroy(void);
    void run(locktree_manager *mgr, void (*escalate_locktrees_fun)(void *extra),
             void *extra);

   private:
    toku_mutex_t m_escalator_mutex;
    toku_cond_t m_escalator_done;
    bool m_escalator_running;
  };

  locktree_escalator m_escalator;

  friend class manager_unit_test;
};

// A locktree represents the set of row locks owned by all transactions
// over an open dictionary. Read and write ranges are represented as
// a left and right key which are compared with the given comparator.
//
// Locktrees are not created and destroyed by the user. Instead, they are
// referenced and released using the locktree manager.
//
// A sample workflow looks like this:
// - Create a manager.
// - Get a locktree by dictionary id from the manager.
// - Perform read/write lock acquisition on the locktree, add references to
//   the locktree using the manager, release locks, release references, etc.
// - ...
// - Release the final reference to the locktree. It will be destroyed.
// - Destroy the manager.
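//
// A minimal sketch of that workflow (hypothetical caller code; the comparator
// `my_cmp`, the mutex factory, and the null callbacks are assumptions for
// illustration):
//
//   toku::locktree_manager mgr;
//   mgr.create(nullptr, nullptr, nullptr, nullptr, mutex_factory);
//   DICTIONARY_ID dict_id = {1};
//   toku::locktree *lt = mgr.get_lt(dict_id, my_cmp, nullptr);
//   ... acquire and release locks through lt ...
//   mgr.release_lt(lt);  // the final release destroys the locktree
//   mgr.destroy();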
class locktree {
 public:
  // effect: Creates a locktree
  void create(locktree_manager *mgr, DICTIONARY_ID dict_id,
              const comparator &cmp,
              toku_external_mutex_factory_t mutex_factory);

  void destroy(void);

  // For thread-safe, external reference counting
  void add_reference(void);

  // requires: the reference count is > 0
  // returns: the reference count, after decrementing it by one
  uint32_t release_reference(void);

  // returns: the current reference count
  uint32_t get_reference_count(void);

  // effect: Attempts to grant a read lock for the range of keys between
  //         [left_key, right_key].
  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
  //          populate the given conflicts set with the txnids that hold
  //          conflicting locks in the range. If the locktree cannot create
  //          more locks, return TOKUDB_OUT_OF_LOCKS.
  // note: Read locks cannot be shared between txnids, as one would expect.
  //       This is for simplicity since read locks are rare in MySQL.
  int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
                        txnid_set *conflicts, bool big_txn);
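
  // Caller sketch (hypothetical; `txnid`, `left`, and `right` are assumed to
  // exist, and waiting on conflicts is normally done through the lock_request
  // machinery rather than by hand):
  //
  //   toku::txnid_set conflicts;  // (initialization elided)
  //   int r = lt->acquire_read_lock(txnid, &left, &right, &conflicts, false);
  //   if (r == DB_LOCK_NOTGRANTED) {
  //     // conflicts now holds the txnids blocking us; wait or give up
  //   } else if (r == TOKUDB_OUT_OF_LOCKS) {
  //     // lock memory exhausted even after escalation; abort or retry
  //   }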

  // effect: Attempts to grant a write lock for the range of keys between
  //         [left_key, right_key].
  // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and
  //          populate the given conflicts set with the txnids that hold
  //          conflicting locks in the range. If the locktree cannot create
  //          more locks, return TOKUDB_OUT_OF_LOCKS.
  int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key,
                         txnid_set *conflicts, bool big_txn);

  // effect: populate the conflicts set with the txnids that would prevent
  //         the given txnid from getting a lock on [left_key, right_key]
  void get_conflicts(bool is_write_request, TXNID txnid, const DBT *left_key,
                     const DBT *right_key, txnid_set *conflicts);

  // effect: Release all of the lock ranges represented by the range buffer
  //         for a txnid.
  void release_locks(TXNID txnid, const range_buffer *ranges,
                     bool all_trx_locks_hint = false);

  // effect: Runs escalation on this locktree
  void escalate(lt_escalate_cb after_escalate_callback, void *extra);

  // returns: The userdata associated with this locktree, or null if it has
  //          not been set.
  void *get_userdata(void) const;

  void set_userdata(void *userdata);

  locktree_manager *get_manager(void) const;

  void set_comparator(const comparator &cmp);

  // Set the user-provided Lock Escalation Barrier check function and its
  // argument.
  //
  // A Lock Escalation Barrier limits the scope of Lock Escalation.
  // For two keys A and B (such that A < B),
  // escalation_barrier_check_func(A, B) == true means that there is a lock
  // escalation barrier between A and B, and lock escalation is not allowed
  // to bridge the gap between A and B.
  void set_escalation_barrier_func(lt_escalation_barrier_check_func func,
                                   void *extra);
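
  // Illustrative barrier check (an assumption, not part of this API): with a
  // key layout where a table id prefixes each key, a barrier between keys
  // from different tables keeps escalation from building ranges that span
  // two tables.
  //
  //   static bool cross_table_barrier(const DBT *a, const DBT *b,
  //                                   void *extra) {
  //     // table_id_of() is a hypothetical helper that extracts the
  //     // table id prefix from a key
  //     return table_id_of(a) != table_id_of(b);
  //   }
  //   ...
  //   lt->set_escalation_barrier_func(cross_table_barrier, nullptr);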

  int compare(const locktree *lt) const;

  DICTIONARY_ID get_dict_id() const;

  // Private info struct for storing pending lock request state.
  // Only to be used by lock requests. We store it here as
  // something less opaque than usual to strike a tradeoff between
  // abstraction and code complexity. It is still fairly abstract
  // since the lock_request object is opaque.
  struct lt_lock_request_info *get_lock_request_info(void);

  typedef void (*dump_callback)(void *cdata, const DBT *left, const DBT *right,
                                TXNID txnid, bool is_shared,
                                TxnidVector *owners);
  void dump_locks(void *cdata, dump_callback cb);

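  // Dump sketch (hypothetical callback; the printing is an assumption):
  //
  //   static void print_lock(void *cdata, const DBT *left, const DBT *right,
  //                          TXNID txnid, bool is_shared,
  //                          TxnidVector *owners) {
  //     // print one lock: its range, its owner txnid, and, for shared
  //     // locks, the full owner set passed in `owners`
  //   }
  //   ...
  //   lt->dump_locks(nullptr, print_lock);
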
 private:
  locktree_manager *m_mgr;
  DICTIONARY_ID m_dict_id;
  uint32_t m_reference_count;

  // Since the memory referenced by this comparator is not owned by the
  // locktree, the user must guarantee it will outlive the locktree.
  //
  // The ydb API accomplishes this by opening an ft_handle in the on_create
  // callback, which will keep the underlying FT (and its descriptor) in memory
  // for as long as the handle is open. The ft_handle is stored opaquely in the
  // userdata pointer below. see locktree_manager::get_lt w/ on_create_extra
  comparator m_cmp;

  lt_escalation_barrier_check_func m_escalation_barrier;
  void *m_escalation_barrier_arg;

  concurrent_tree *m_rangetree;

  void *m_userdata;
  struct lt_lock_request_info m_lock_request_info;

  // psergey-todo:
  // Each transaction also keeps a list of ranges it has locked.
  // So, when a transaction is running in STO mode, two identical
  // lists are kept: the STO lock list and the transaction's owned locks
  // list. Why can't we do with just one list?

  // The following fields and members prefixed with "sto_" are for
  // the single txnid optimization, intended to speed up the case
  // when only one transaction is using the locktree. If we know
  // the locktree has only one transaction, then acquiring locks
  // takes O(1) work and releasing all locks takes O(1) work.
  //
  // How do we know that the locktree only has a single txnid?
  // What do we do if it does?
  //
  // When a txn with txnid T requests a lock:
  // - If the tree is empty, the optimization is possible. Set the single
  //   txnid to T, and insert the lock range into the buffer.
  // - If the tree is not empty, check if the single txnid is T. If so,
  //   append the lock range to the buffer. Otherwise, migrate all of
  //   the locks in the buffer into the rangetree on behalf of the single
  //   txnid, and invalidate the single txnid.
  //
  // When a txn with txnid T releases its locks:
  // - If the single txnid is valid, it must be for T. Destroy the buffer.
  // - If it's not valid, release locks the normal way in the rangetree.
  //
  // To carry out the optimization we need to record a single txnid
  // and a range buffer for each locktree, each protected by the root
  // lock of the locktree's rangetree. The root lock for a rangetree
  // is grabbed by preparing a locked keyrange on the rangetree.
  TXNID m_sto_txnid;
  range_buffer m_sto_buffer;
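
  // Acquire-path sketch of the optimization above (simplified pseudocode;
  // the real logic lives in sto_try_acquire() and sto_end_early() below, and
  // the empty-tree/score checks shown here are paraphrased assumptions):
  //
  //   if (m_sto_txnid == TXNID_NONE && tree_is_empty &&
  //       m_sto_score >= STO_SCORE_THRESHOLD)
  //     sto_begin(txnid);              // start tracking the single txnid
  //   if (m_sto_txnid == txnid)
  //     sto_append(left, right, wr);   // O(1): just extend the buffer
  //   else if (m_sto_txnid != TXNID_NONE)
  //     sto_end_early(prepared_lkr);   // migrate buffer locks, reset score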

  // The single txnid optimization speeds up the case when only one
  // transaction is using the locktree. But it has the potential to
  // hurt the case when more than one txnid exists.
  //
  // There are two things we need to do to make the optimization only
  // optimize the case we care about, and not hurt the general case.
  //
  // Bound the worst-case latency for lock migration when the
  // optimization stops working:
  // - Idea: Stop the optimization and migrate immediately if we notice
  //   the single txnid has taken many locks in the range buffer.
  // - Implementation: Enforce a max size on the single txnid range buffer.
  // - Analysis: Choosing the perfect max value, M, is difficult to do
  //   without some feedback from the field. Intuition tells us that M should
  //   not be so small that the optimization is worthless, and it should not
  //   be so big that it's unreasonable to have to wait behind a thread doing
  //   the work of converting M buffer locks into rangetree locks.
  //
  // Prevent concurrent-transaction workloads from trying the optimization
  // in vain:
  // - Idea: Don't even bother trying the optimization if we think the
  //   system is in a concurrent-transaction state.
  // - Implementation: Do something even simpler than detecting whether the
  //   system is in a concurrent-transaction state. Just keep a "score" value
  //   and some threshold. If at any time the locktree is eligible for the
  //   optimization, only do it if the score is at this threshold. When you
  //   actually do the optimization but someone has to migrate locks in the
  //   buffer (expensive), then reset the score back to zero. Each time a txn
  //   releases locks, the score is incremented by 1.
  // - Analysis: If you let the threshold be "C", then at most 1 / C txns will
  //   do the optimization in a concurrent-transaction system. Similarly, it
  //   takes at most C txns to start using the single txnid optimization,
  //   which is good when the system transitions from multithreaded to single
  //   threaded.
  //
  // STO_BUFFER_MAX_SIZE:
  //
  // We choose the max value to be 1 million since most transactions are
  // smaller than 1 million and we can create a rangetree of 1 million
  // elements in less than a second. So we can be pretty confident that this
  // threshold enables the optimization almost always, and prevents super
  // pathological latency issues for the first lock taken by a second thread.
  //
  // STO_SCORE_THRESHOLD:
  //
  // A simple first guess at a good value for the score threshold is 100.
  // By our analysis, we'd end up doing the optimization in vain for
  // around 1% of all transactions, which seems reasonable. Further,
  // if the system goes single threaded, it ought to be pretty quick
  // for 100 transactions to go by, so we won't have to wait long before
  // we start doing the single txnid optimization again.
  static const int STO_BUFFER_MAX_SIZE = 50 * 1024;
  static const int STO_SCORE_THRESHOLD = 100;
  int m_sto_score;

  // statistics about time spent ending the STO early
  uint64_t m_sto_end_early_count;
  tokutime_t m_sto_end_early_time;

  // effect: begins the single txnid optimization, setting m_sto_txnid
  //         to the given txnid.
  // requires: m_sto_txnid is invalid
  void sto_begin(TXNID txnid);

  // effect: append a range to the sto buffer
  // requires: m_sto_txnid is valid
  void sto_append(const DBT *left_key, const DBT *right_key,
                  bool is_write_request);

  // effect: ends the single txnid optimization, releasing any memory
  //         stored in the sto buffer, notifying the tracker, and
  //         invalidating m_sto_txnid.
  // requires: m_sto_txnid is valid
  void sto_end(void);

  // params: prepared_lkr is a void * to a prepared locked keyrange. see below.
  // effect: ends the single txnid optimization early, migrating buffer locks
  //         into the rangetree, calling sto_end(), and then setting the
  //         sto_score back to zero.
  // requires: m_sto_txnid is valid
  void sto_end_early(void *prepared_lkr);
  void sto_end_early_no_accounting(void *prepared_lkr);

  // params: prepared_lkr is a void * to a prepared locked keyrange. we can't
  //         use the real type because the compiler won't allow us to forward
  //         declare concurrent_tree::locked_keyrange without including
  //         concurrent_tree.h, which we cannot do here because it is a
  //         template implementation.
  // requires: the prepared locked keyrange is for the locktree's rangetree
  // requires: m_sto_txnid is valid
  // effect: migrates each lock in the single txnid buffer into the locktree's
  //         rangetree, notifying the memory tracker as necessary.
  void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);

  // effect: If m_sto_txnid is valid, then release the txnid's locks
  //         by ending the optimization.
  // requires: If m_sto_txnid is valid, it is equal to the given txnid
  // returns: True if locks were released for this txnid
  bool sto_try_release(TXNID txnid);

  // params: prepared_lkr is a void * to a prepared locked keyrange. see above.
  // requires: the prepared locked keyrange is for the locktree's rangetree
  // effect: If m_sto_txnid is valid and equal to the given txnid, then
  //         append a range onto the buffer. Otherwise, if m_sto_txnid is valid
  //         but not equal to this txnid, then migrate the buffer's locks
  //         into the rangetree and end the optimization, setting the score
  //         back to zero.
  // returns: true if the lock was acquired for this txnid
  bool sto_try_acquire(void *prepared_lkr, TXNID txnid, const DBT *left_key,
                       const DBT *right_key, bool is_write_request);

  // Effect:
  //   Provides a hook for a helgrind suppression.
  // Returns:
  //   true if m_sto_txnid is not TXNID_NONE
  bool sto_txnid_is_valid_unsafe(void) const;

  // Effect:
  //   Provides a hook for a helgrind suppression.
  // Returns:
  //   m_sto_score
  int sto_get_score_unsafe(void) const;

  void remove_overlapping_locks_for_txnid(TXNID txnid, const DBT *left_key,
                                          const DBT *right_key);

  int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
                                const DBT *left_key, const DBT *right_key,
                                bool is_write_request, txnid_set *conflicts);

  int acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
                   const DBT *right_key, txnid_set *conflicts);

  int try_acquire_lock(bool is_write_request, TXNID txnid, const DBT *left_key,
                       const DBT *right_key, txnid_set *conflicts,
                       bool big_txn);

  friend class locktree_unit_test;
  friend class manager_unit_test;
  friend class lock_request_unit_test;

  // engine status reaches into the locktree to read some stats
  friend void locktree_manager::get_status(LTM_STATUS status);
};

} /* namespace toku */