drivers/md/bcache/extents.c

   1 /*
   2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
   3  *
   4  * Uses a block device as cache for other block devices; optimized for SSDs.
   5  * All allocation is done in buckets, which should match the erase block size
   6  * of the device.
   7  *
   8  * Buckets containing cached data are kept on a heap sorted by priority;
   9  * bucket priority is increased on cache hit, and periodically all the buckets
  10  * on the heap have their priority scaled down. This currently is just used as
  11  * an LRU but in the future should allow for more intelligent heuristics.
  12  *
  13  * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
  14  * counter. Garbage collection is used to remove stale pointers.
  15  *
  16  * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
  17  * as keys are inserted we only sort the pages that have not yet been written.
  18  * When garbage collection is run, we resort the entire node.
  19  *
  20  * All configuration is done via sysfs; see Documentation/bcache.txt.
  21  */
  22
  23 #include "bcache.h"
  24 #include "btree.h"
  25 #include "debug.h"
  26 #include "extents.h"
  27 #include "writeback.h"
  28
  29 static void sort_key_next(struct btree_iter *iter,
  30                           struct btree_iter_set *i)
  31 {
  32         i->k = bkey_next(i->k);
  33
  34         if (i->k == i->end)
  35                 *i = iter->data[--iter->used];
  36 }
  37
  38 static bool bch_key_sort_cmp(struct btree_iter_set l,
  39                              struct btree_iter_set r)
  40 {
  41         int64_t c = bkey_cmp(l.k, r.k);
  42
  43         return c ? c > 0 : l.k < r.k;
  44 }
  45
  46 static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
  47 {
  48         unsigned i;
  49
  50         for (i = 0; i < KEY_PTRS(k); i++)
  51                 if (ptr_available(c, k, i)) {
  52                         struct cache *ca = PTR_CACHE(c, k, i);
  53                         size_t bucket = PTR_BUCKET_NR(c, k, i);
  54                         size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
  55
  56                         if (KEY_SIZE(k) + r > c->sb.bucket_size ||
  57                             bucket <  ca->sb.first_bucket ||
  58                             bucket >= ca->sb.nbuckets)
  59                                 return true;
  60                 }
  61
  62         return false;
  63 }
  64
  65 /* Common among btree and extent ptrs */
  66
  67 static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
  68 {
  69         unsigned i;
  70
  71         for (i = 0; i < KEY_PTRS(k); i++)
  72                 if (ptr_available(c, k, i)) {
  73                         struct cache *ca = PTR_CACHE(c, k, i);
  74                         size_t bucket = PTR_BUCKET_NR(c, k, i);
  75                         size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
  76
  77                         if (KEY_SIZE(k) + r > c->sb.bucket_size)
  78                                 return "bad, length too big";
  79                         if (bucket <  ca->sb.first_bucket)
  80                                 return "bad, short offset";
  81                         if (bucket >= ca->sb.nbuckets)
  82                                 return "bad, offset past end of device";
  83                         if (ptr_stale(c, k, i))
  84                                 return "stale";
  85                 }
  86
  87         if (!bkey_cmp(k, &ZERO_KEY))
  88                 return "bad, null key";
  89         if (!KEY_PTRS(k))
  90                 return "bad, no pointers";
  91         if (!KEY_SIZE(k))
  92                 return "zeroed key";
  93         return "";
  94 }
  95
  96 void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
  97 {
  98         unsigned i = 0;
  99         char *out = buf, *end = buf + size;
 100
 101 #define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))
 102
 103         p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));
 104
 105         for (i = 0; i < KEY_PTRS(k); i++) {
 106                 if (i)
 107                         p(", ");
 108
 109                 if (PTR_DEV(k, i) == PTR_CHECK_DEV)
 110                         p("check dev");
 111                 else
 112                         p("%llu:%llu gen %llu", PTR_DEV(k, i),
 113                           PTR_OFFSET(k, i), PTR_GEN(k, i));
 114         }
 115
 116         p("]");
 117
 118         if (KEY_DIRTY(k))
 119                 p(" dirty");
 120         if (KEY_CSUM(k))
 121                 p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
 122 #undef p
 123 }
 124
 125 static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
 126 {
 127         struct btree *b = container_of(keys, struct btree, keys);
 128         unsigned j;
 129         char buf[80];
 130
 131         bch_extent_to_text(buf, sizeof(buf), k);
 132         printk(" %s", buf);
 133
 134         for (j = 0; j < KEY_PTRS(k); j++) {
 135                 size_t n = PTR_BUCKET_NR(b->c, k, j);
 136                 printk(" bucket %zu", n);
 137
 138                 if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
 139                         printk(" prio %i",
 140                                PTR_BUCKET(b->c, k, j)->prio);
 141         }
 142
 143         printk(" %s\n", bch_ptr_status(b->c, k));
 144 }
 145
 146 /* Btree ptrs */
 147
 148 bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
 149 {
 150         char buf[80];
 151
 152         if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
 153                 goto bad;
 154
 155         if (__ptr_invalid(c, k))
 156                 goto bad;
 157
 158         return false;
 159 bad:
 160         bch_extent_to_text(buf, sizeof(buf), k);
 161         cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
 162         return true;
 163 }
 164
 165 static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
 166 {
 167         struct btree *b = container_of(bk, struct btree, keys);
 168         return __bch_btree_ptr_invalid(b->c, k);
 169 }
 170
 171 static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
 172 {
 173         unsigned i;
 174         char buf[80];
 175         struct bucket *g;
 176
 177         if (mutex_trylock(&b->c->bucket_lock)) {
 178                 for (i = 0; i < KEY_PTRS(k); i++)
 179                         if (ptr_available(b->c, k, i)) {
 180                                 g = PTR_BUCKET(b->c, k, i);
 181
 182                                 if (KEY_DIRTY(k) ||
 183                                     g->prio != BTREE_PRIO ||
 184                                     (b->c->gc_mark_valid &&
 185                                      GC_MARK(g) != GC_MARK_METADATA))
 186                                         goto err;
 187                         }
 188
 189                 mutex_unlock(&b->c->bucket_lock);
 190         }
 191
 192         return false;
 193 err:
 194         mutex_unlock(&b->c->bucket_lock);
 195         bch_extent_to_text(buf, sizeof(buf), k);
 196         btree_bug(b,
 197 "inconsistent btree pointer %s: bucket %li pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
 198                   buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
 199                   g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
 200         return true;
 201 }
 202
 203 static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
 204 {
 205         struct btree *b = container_of(bk, struct btree, keys);
 206         unsigned i;
 207
 208         if (!bkey_cmp(k, &ZERO_KEY) ||
 209             !KEY_PTRS(k) ||
 210             bch_ptr_invalid(bk, k))
 211                 return true;
 212
 213         for (i = 0; i < KEY_PTRS(k); i++)
 214                 if (!ptr_available(b->c, k, i) ||
 215                     ptr_stale(b->c, k, i))
 216                         return true;
 217
 218         if (expensive_debug_checks(b->c) &&
 219             btree_ptr_bad_expensive(b, k))
 220                 return true;
 221
 222         return false;
 223 }
 224
 225 static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
 226                                        struct bkey *insert,
 227                                        struct btree_iter *iter,
 228                                        struct bkey *replace_key)
 229 {
 230         struct btree *b = container_of(bk, struct btree, keys);
 231
 232         if (!KEY_OFFSET(insert))
 233                 btree_current_write(b)->prio_blocked++;
 234
 235         return false;
 236 }
 237
 238 const struct btree_keys_ops bch_btree_keys_ops = {
 239         .sort_cmp       = bch_key_sort_cmp,
 240         .insert_fixup   = bch_btree_ptr_insert_fixup,
 241         .key_invalid    = bch_btree_ptr_invalid,
 242         .key_bad        = bch_btree_ptr_bad,
 243         .key_to_text    = bch_extent_to_text,
 244         .key_dump       = bch_bkey_dump,
 245 };
 246
 247 /* Extents */
 248
 249 /*
 250  * Returns true if l > r - unless l == r, in which case returns true if l is
 251  * older than r.
 252  *
 253  * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 254  * equal in different sets, we have to process them newest to oldest.
 255  */
 256 static bool bch_extent_sort_cmp(struct btree_iter_set l,
 257                                 struct btree_iter_set r)
 258 {
 259         int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));
 260
 261         return c ? c > 0 : l.k < r.k;
 262 }
 263
 264 static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
 265                                           struct bkey *tmp)
 266 {
 267         while (iter->used > 1) {
 268                 struct btree_iter_set *top = iter->data, *i = top + 1;
 269
 270                 if (iter->used > 2 &&
 271                     bch_extent_sort_cmp(i[0], i[1]))
 272                         i++;
 273
 274                 if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
 275                         break;
 276
 277                 if (!KEY_SIZE(i->k)) {
 278                         sort_key_next(iter, i);
 279                         heap_sift(iter, i - top, bch_extent_sort_cmp);
 280                         continue;
 281                 }
 282
 283                 if (top->k > i->k) {
 284                         if (bkey_cmp(top->k, i->k) >= 0)
 285                                 sort_key_next(iter, i);
 286                         else
 287                                 bch_cut_front(top->k, i->k);
 288
 289                         heap_sift(iter, i - top, bch_extent_sort_cmp);
 290                 } else {
 291                         /* can't happen because of comparison func */
 292                         BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
 293
 294                         if (bkey_cmp(i->k, top->k) < 0) {
 295                                 bkey_copy(tmp, top->k);
 296
 297                                 bch_cut_back(&START_KEY(i->k), tmp);
 298                                 bch_cut_front(i->k, top->k);
 299                                 heap_sift(iter, 0, bch_extent_sort_cmp);
 300
 301                                 return tmp;
 302                         } else {
 303                                 bch_cut_back(&START_KEY(i->k), top->k);
 304                         }
 305                 }
 306         }
 307
 308         return NULL;
 309 }
 310
 311 static bool bch_extent_insert_fixup(struct btree_keys *b,
 312                                     struct bkey *insert,
 313                                     struct btree_iter *iter,
 314                                     struct bkey *replace_key)
 315 {
 316         struct cache_set *c = container_of(b, struct btree, keys)->c;
 317
 318         void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
 319         {
 320                 if (KEY_DIRTY(k))
 321                         bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
 322                                                      offset, -sectors);
 323         }
 324
 325         uint64_t old_offset;
 326         unsigned old_size, sectors_found = 0;
 327
 328         BUG_ON(!KEY_OFFSET(insert));
 329         BUG_ON(!KEY_SIZE(insert));
 330
 331         while (1) {
 332                 struct bkey *k = bch_btree_iter_next(iter);
 333                 if (!k)
 334                         break;
 335
 336                 if (bkey_cmp(&START_KEY(k), insert) >= 0) {
 337                         if (KEY_SIZE(k))
 338                                 break;
 339                         else
 340                                 continue;
 341                 }
 342
 343                 if (bkey_cmp(k, &START_KEY(insert)) <= 0)
 344                         continue;
 345
 346                 old_offset = KEY_START(k);
 347                 old_size = KEY_SIZE(k);
 348
 349                 /*
 350                  * We might overlap with 0 size extents; we can't skip these
 351                  * because if they're in the set we're inserting to we have to
 352                  * adjust them so they don't overlap with the key we're
 353                  * inserting. But we don't want to check them for replace
 354                  * operations.
 355                  */
 356
 357                 if (replace_key && KEY_SIZE(k)) {
 358                         /*
 359                          * k might have been split since we inserted/found the
 360                          * key we're replacing
 361                          */
 362                         unsigned i;
 363                         uint64_t offset = KEY_START(k) -
 364                                 KEY_START(replace_key);
 365
 366                         /* But it must be a subset of the replace key */
 367                         if (KEY_START(k) < KEY_START(replace_key) ||
 368                             KEY_OFFSET(k) > KEY_OFFSET(replace_key))
 369                                 goto check_failed;
 370
 371                         /* We didn't find a key that we were supposed to */
 372                         if (KEY_START(k) > KEY_START(insert) + sectors_found)
 373                                 goto check_failed;
 374
 375                         if (!bch_bkey_equal_header(k, replace_key))
 376                                 goto check_failed;
 377
 378                         /* skip past gen */
 379                         offset <<= 8;
 380
 381                         BUG_ON(!KEY_PTRS(replace_key));
 382
 383                         for (i = 0; i < KEY_PTRS(replace_key); i++)
 384                                 if (k->ptr[i] != replace_key->ptr[i] + offset)
 385                                         goto check_failed;
 386
 387                         sectors_found = KEY_OFFSET(k) - KEY_START(insert);
 388                 }
 389
 390                 if (bkey_cmp(insert, k) < 0 &&
 391                     bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
 392                         /*
 393                          * We overlapped in the middle of an existing key: that
 394                          * means we have to split the old key. But we have to do
 395                          * slightly different things depending on whether the
 396                          * old key has been written out yet.
 397                          */
 398
 399                         struct bkey *top;
 400
 401                         subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
 402
 403                         if (bkey_written(b, k)) {
 404                                 /*
 405                                  * We insert a new key to cover the top of the
 406                                  * old key, and the old key is modified in place
 407                                  * to represent the bottom split.
 408                                  *
 409                                  * It's completely arbitrary whether the new key
 410                                  * is the top or the bottom, but it has to match
 411                                  * up with what btree_sort_fixup() does - it
 412                                  * doesn't check for this kind of overlap, it
 413                                  * depends on us inserting a new key for the top
 414                                  * here.
 415                                  */
 416                                 top = bch_bset_search(b, bset_tree_last(b),
 417                                                       insert);
 418                                 bch_bset_insert(b, top, k);
 419                         } else {
 420                                 BKEY_PADDED(key) temp;
 421                                 bkey_copy(&temp.key, k);
 422                                 bch_bset_insert(b, k, &temp.key);
 423                                 top = bkey_next(k);
 424                         }
 425
 426                         bch_cut_front(insert, top);
 427                         bch_cut_back(&START_KEY(insert), k);
 428                         bch_bset_fix_invalidated_key(b, k);
 429                         goto out;
 430                 }
 431
 432                 if (bkey_cmp(insert, k) < 0) {
 433                         bch_cut_front(insert, k);
 434                 } else {
 435                         if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
 436                                 old_offset = KEY_START(insert);
 437
 438                         if (bkey_written(b, k) &&
 439                             bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
 440                                 /*
 441                                  * Completely overwrote, so we don't have to
 442                                  * invalidate the binary search tree
 443                                  */
 444                                 bch_cut_front(k, k);
 445                         } else {
 446                                 __bch_cut_back(&START_KEY(insert), k);
 447                                 bch_bset_fix_invalidated_key(b, k);
 448                         }
 449                 }
 450
 451                 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
 452         }
 453
 454 check_failed:
 455         if (replace_key) {
 456                 if (!sectors_found) {
 457                         return true;
 458                 } else if (sectors_found < KEY_SIZE(insert)) {
 459                         SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
 460                                        (KEY_SIZE(insert) - sectors_found));
 461                         SET_KEY_SIZE(insert, sectors_found);
 462                 }
 463         }
 464 out:
 465         if (KEY_DIRTY(insert))
 466                 bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
 467                                              KEY_START(insert),
 468                                              KEY_SIZE(insert));
 469
 470         return false;
 471 }
 472
 473 static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
 474 {
 475         struct btree *b = container_of(bk, struct btree, keys);
 476         char buf[80];
 477
 478         if (!KEY_SIZE(k))
 479                 return true;
 480
 481         if (KEY_SIZE(k) > KEY_OFFSET(k))
 482                 goto bad;
 483
 484         if (__ptr_invalid(b->c, k))
 485                 goto bad;
 486
 487         return false;
 488 bad:
 489         bch_extent_to_text(buf, sizeof(buf), k);
 490         cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
 491         return true;
 492 }
 493
 494 static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
 495                                      unsigned ptr)
 496 {
 497         struct bucket *g = PTR_BUCKET(b->c, k, ptr);
 498         char buf[80];
 499
 500         if (mutex_trylock(&b->c->bucket_lock)) {
 501                 if (b->c->gc_mark_valid &&
 502                     ((GC_MARK(g) != GC_MARK_DIRTY &&
 503                       KEY_DIRTY(k)) ||
 504                      GC_MARK(g) == GC_MARK_METADATA))
 505                         goto err;
 506
 507                 if (g->prio == BTREE_PRIO)
 508                         goto err;
 509
 510                 mutex_unlock(&b->c->bucket_lock);
 511         }
 512
 513         return false;
 514 err:
 515         mutex_unlock(&b->c->bucket_lock);
 516         bch_extent_to_text(buf, sizeof(buf), k);
 517         btree_bug(b,
 518 "inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
 519                   buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
 520                   g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
 521         return true;
 522 }
 523
 524 static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
 525 {
 526         struct btree *b = container_of(bk, struct btree, keys);
 527         struct bucket *g;
 528         unsigned i, stale;
 529
 530         if (!KEY_PTRS(k) ||
 531             bch_extent_invalid(bk, k))
 532                 return true;
 533
 534         for (i = 0; i < KEY_PTRS(k); i++)
 535                 if (!ptr_available(b->c, k, i))
 536                         return true;
 537
 538         if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
 539                 return false;
 540
 541         for (i = 0; i < KEY_PTRS(k); i++) {
 542                 g = PTR_BUCKET(b->c, k, i);
 543                 stale = ptr_stale(b->c, k, i);
 544
 545                 btree_bug_on(stale > 96, b,
 546                              "key too stale: %i, need_gc %u",
 547                              stale, b->c->need_gc);
 548
 549                 btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
 550                              b, "stale dirty pointer");
 551
 552                 if (stale)
 553                         return true;
 554
 555                 if (expensive_debug_checks(b->c) &&
 556                     bch_extent_bad_expensive(b, k, i))
 557                         return true;
 558         }
 559
 560         return false;
 561 }
 562
 563 static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
 564 {
 565         return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
 566                 ~((uint64_t)1 << 63);
 567 }
 568
 569 static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
 570 {
 571         struct btree *b = container_of(bk, struct btree, keys);
 572         unsigned i;
 573
 574         if (key_merging_disabled(b->c))
 575                 return false;
 576
 577         for (i = 0; i < KEY_PTRS(l); i++)
 578                 if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
 579                     PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
 580                         return false;
 581
 582         /* Keys with no pointers aren't restricted to one bucket and could
 583          * overflow KEY_SIZE
 584          */
 585         if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
 586                 SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
 587                 SET_KEY_SIZE(l, USHRT_MAX);
 588
 589                 bch_cut_front(l, r);
 590                 return false;
 591         }
 592
 593         if (KEY_CSUM(l)) {
 594                 if (KEY_CSUM(r))
 595                         l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
 596                 else
 597                         SET_KEY_CSUM(l, 0);
 598         }
 599
 600         SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
 601         SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
 602
 603         return true;
 604 }
 605
 606 const struct btree_keys_ops bch_extent_keys_ops = {
 607         .sort_cmp       = bch_extent_sort_cmp,
 608         .sort_fixup     = bch_extent_sort_fixup,
 609         .insert_fixup   = bch_extent_insert_fixup,
 610         .key_invalid    = bch_extent_invalid,
 611         .key_bad        = bch_extent_bad,
 612         .key_merge      = bch_extent_merge,
 613         .key_to_text    = bch_extent_to_text,
 614         .key_dump       = bch_bkey_dump,
 615         .is_extents     = true,
 616 };