/*
 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
/*
 * Filesystem Meta Information Block Cache (mbcache)
 *
 * The mbcache caches blocks of block devices that need to be located
 * by their device/block number, as well as by other criteria (such
 * as the block's contents).
 *
 * There can only be one cache entry in a cache per device and block number.
 * Additional indexes need not be unique in this sense. The number of
 * additional indexes (=other criteria) can be hardwired at compile time
 * or specified at cache create time.
 *
 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
 * in the cache. A valid entry is in the main hash tables of the cache,
 * and may also be on the lru list. An invalid entry is not in any hashes
 * or lists.
 *
 * A valid cache entry is only on the lru list if no handles refer to it.
 * Invalid cache entries will be freed when the last handle to the cache
 * entry is released. Entries that cannot be freed immediately are put
 * back on the lru list.
 */
/*
 * Lock descriptions and usage:
 *
 * Each hash chain of both the block and index hash tables now contains
 * a built-in lock used to serialize accesses to the hash chain.
 *
 * Accesses to the global data structures mb_cache_list and mb_cache_lru_list
 * are serialized via the global spinlock mb_cache_spinlock.
 *
 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
 * accesses to its local data, such as e_used and e_queued.
 *
 * To avoid deadlock, the locks rank as follows:
 *
 * Each block hash chain's lock has the highest lock order, followed by an
 * index hash chain's lock, mb_cache_bg_lock (used to implement an
 * mb_cache_entry's lock), and mb_cache_spinlock, with the lowest order.
 * While holding either a block or index hash chain lock, a thread can
 * acquire mb_cache_bg_lock, which in turn can nest mb_cache_spinlock.
 *
 * Since both mb_cache_entry_get() and mb_cache_entry_find() scan a block or
 * index hash chain, they need to lock the corresponding hash chain. For each
 * mb_cache_entry within the chain, they also need to lock the entry to
 * prevent any simultaneous release or free of the entry, and to serialize
 * accesses to its e_used and e_queued members.
 *
 * To avoid a dangling reference to an already freed mb_cache_entry, an
 * mb_cache_entry is only freed when it is not on a block hash chain and
 * is no longer referenced, i.e. both e_used and e_queued are zero. When
 * an mb_cache_entry is explicitly freed, it is first removed from its
 * block hash chain.
 */
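/*
 * Editorial sketch (not part of the original file): the deepest nesting the
 * lock order above allows.  The sequence below is illustrative only; no
 * single function in this file takes all four locks at once.
 *
 *	hlist_bl_lock(ce->e_block_hash_p);	highest order
 *	hlist_bl_lock(ce->e_index_hash_p);
 *	__spin_lock_mb_cache_entry(ce);		takes mb_cache_bg_lock
 *	spin_lock(&mb_cache_spinlock);		lowest order
 *	spin_unlock(&mb_cache_spinlock);
 *	__spin_unlock_mb_cache_entry(ce);
 *	hlist_bl_unlock(ce->e_index_hash_p);
 *	hlist_bl_unlock(ce->e_block_hash_p);
 */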
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/hash.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/list_bl.h>
#include <linux/mbcache.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/blockgroup_lock.h>
#ifdef MB_CACHE_DEBUG
# define mb_debug(f...) do { \
		printk(KERN_DEBUG f); \
		printk("\n"); \
	} while (0)
# define mb_assert(c) do { if (!(c)) \
		printk(KERN_ERR "assertion " #c " failed\n"); \
	} while (0)
#else
# define mb_debug(f...) do { } while (0)
# define mb_assert(c) do { } while (0)
#endif
#define mb_error(f...) do { \
		printk(KERN_ERR f); \
		printk("\n"); \
	} while (0)
#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
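/*
 * Editorial note: e_used is a read/write count in one field.  Each reader
 * adds 1; a writer adds 1 + MB_CACHE_WRITER (half the unsigned short range),
 * so e_used >= MB_CACHE_WRITER means "write locked" while the low bits still
 * count outstanding handles, e.g.:
 *
 *	ce->e_used = 1 + MB_CACHE_WRITER;	new entry, write locked
 *	ce->e_used -= MB_CACHE_WRITER;		drop writer, keep the handle
 */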
#define MB_CACHE_ENTRY_LOCK_BITS	ilog2(NR_BG_LOCKS)
#define MB_CACHE_ENTRY_LOCK_INDEX(ce)	\
	(hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
static struct blockgroup_lock *mb_cache_bg_lock;
static struct kmem_cache *mb_cache_kmem_cache;
MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");
EXPORT_SYMBOL(mb_cache_create);
EXPORT_SYMBOL(mb_cache_shrink);
EXPORT_SYMBOL(mb_cache_destroy);
EXPORT_SYMBOL(mb_cache_entry_alloc);
EXPORT_SYMBOL(mb_cache_entry_insert);
EXPORT_SYMBOL(mb_cache_entry_release);
EXPORT_SYMBOL(mb_cache_entry_free);
EXPORT_SYMBOL(mb_cache_entry_get);
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
EXPORT_SYMBOL(mb_cache_entry_find_first);
EXPORT_SYMBOL(mb_cache_entry_find_next);
#endif
/*
 * Global data: list of all mbcaches, the lru list, and a spinlock for
 * accessing cache data structures on SMP machines. The lru list is
 * global across all mbcaches.
 */

static LIST_HEAD(mb_cache_list);
static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);
static inline void
__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
{
	spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
			       MB_CACHE_ENTRY_LOCK_INDEX(ce)));
}

static inline void
__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
{
	spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
				 MB_CACHE_ENTRY_LOCK_INDEX(ce)));
}
static inline int
__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
{
	return !hlist_bl_unhashed(&ce->e_block_list);
}

static inline void
__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
{
	if (__mb_cache_entry_is_block_hashed(ce))
		hlist_bl_del_init(&ce->e_block_list);
}

static inline int
__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
{
	return !hlist_bl_unhashed(&ce->e_index.o_list);
}

static inline void
__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
{
	if (__mb_cache_entry_is_index_hashed(ce))
		hlist_bl_del_init(&ce->e_index.o_list);
}
/*
 * __mb_cache_entry_unhash_unlock()
 *
 * This function unhashes the entry from both the block and index hash
 * chains.  It assumes both hash chains are locked upon entry, and
 * unlocks both upon exit.
 */
static inline void
__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
{
	__mb_cache_entry_unhash_index(ce);
	hlist_bl_unlock(ce->e_index_hash_p);
	__mb_cache_entry_unhash_block(ce);
	hlist_bl_unlock(ce->e_block_hash_p);
}
static void
__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
	struct mb_cache *cache = ce->e_cache;

	mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
	kmem_cache_free(cache->c_entry_cache, ce);
	atomic_dec(&cache->c_entry_count);
}
static void
__mb_cache_entry_release(struct mb_cache_entry *ce)
{
	/* First lock the entry to serialize access to its local data. */
	__spin_lock_mb_cache_entry(ce);
	/* Wake up all processes queuing for this cache entry. */
	if (ce->e_queued)
		wake_up_all(&mb_cache_queue);
	if (ce->e_used >= MB_CACHE_WRITER)
		ce->e_used -= MB_CACHE_WRITER;
	/*
	 * Make sure that all cache entries on the lru list have
	 * both e_used and e_queued of 0s.
	 */
	ce->e_used--;
	if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
		if (!__mb_cache_entry_is_block_hashed(ce)) {
			__spin_unlock_mb_cache_entry(ce);
			goto forget;
		}
		/*
		 * Need access to the lru list; mb_cache_spinlock has the
		 * lowest lock order, so it may nest inside the entry lock.
		 */
		spin_lock(&mb_cache_spinlock);
		if (list_empty(&ce->e_lru_list))
			list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
		spin_unlock(&mb_cache_spinlock);
	}
	__spin_unlock_mb_cache_entry(ce);
	return;
forget:
	mb_assert(list_empty(&ce->e_lru_list));
	__mb_cache_entry_forget(ce, GFP_KERNEL);
}
/*
 * mb_cache_shrink_scan()  memory pressure callback
 *
 * This function is called by the kernel memory management when memory
 * gets low.
 *
 * @shrink: (ignored)
 * @sc: shrink_control passed from reclaim
 *
 * Returns the number of objects freed.
 */
static unsigned long
mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	LIST_HEAD(free_list);
	struct mb_cache_entry *entry, *tmp;
	int nr_to_scan = sc->nr_to_scan;
	gfp_t gfp_mask = sc->gfp_mask;
	unsigned long freed = 0;

	mb_debug("trying to free %d entries", nr_to_scan);
	spin_lock(&mb_cache_spinlock);
	while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
		struct mb_cache_entry *ce =
			list_entry(mb_cache_lru_list.next,
				   struct mb_cache_entry, e_lru_list);
		list_del_init(&ce->e_lru_list);
		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
			continue;
		spin_unlock(&mb_cache_spinlock);
		/* Prevent any find or get operation on the entry */
		hlist_bl_lock(ce->e_block_hash_p);
		hlist_bl_lock(ce->e_index_hash_p);
		/* Ignore if it is touched by a find/get */
		if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
		    !list_empty(&ce->e_lru_list)) {
			hlist_bl_unlock(ce->e_index_hash_p);
			hlist_bl_unlock(ce->e_block_hash_p);
			spin_lock(&mb_cache_spinlock);
			continue;
		}
		__mb_cache_entry_unhash_unlock(ce);
		list_add_tail(&ce->e_lru_list, &free_list);
		spin_lock(&mb_cache_spinlock);
	}
	spin_unlock(&mb_cache_spinlock);

	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
		__mb_cache_entry_forget(entry, gfp_mask);
		freed++;
	}
	return freed;
}
static unsigned long
mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct mb_cache *cache;
	unsigned long count = 0;

	spin_lock(&mb_cache_spinlock);
	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
		mb_debug("cache %s (%d)", cache->c_name,
			 atomic_read(&cache->c_entry_count));
		count += atomic_read(&cache->c_entry_count);
	}
	spin_unlock(&mb_cache_spinlock);

	return vfs_pressure_ratio(count);
}
static struct shrinker mb_cache_shrinker = {
	.count_objects = mb_cache_shrink_count,
	.scan_objects = mb_cache_shrink_scan,
	.seeks = DEFAULT_SEEKS,
};
/*
 * mb_cache_create()  create a new cache
 *
 * All entries in one cache are equal size. Cache entries may be from
 * multiple devices. If this is the first mbcache created, registers
 * the cache with kernel memory management. Returns NULL if no more
 * memory was available.
 *
 * @name: name of the cache (informal)
 * @bucket_bits: log2(number of hash buckets)
 */
struct mb_cache *
mb_cache_create(const char *name, int bucket_bits)
{
	int n, bucket_count = 1 << bucket_bits;
	struct mb_cache *cache = NULL;

	if (!mb_cache_bg_lock) {
		mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
					   GFP_KERNEL);
		if (!mb_cache_bg_lock)
			return NULL;
		bgl_lock_init(mb_cache_bg_lock);
	}

	cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
	if (!cache)
		return NULL;
	cache->c_name = name;
	atomic_set(&cache->c_entry_count, 0);
	cache->c_bucket_bits = bucket_bits;
	cache->c_block_hash = kmalloc(bucket_count *
		sizeof(struct hlist_bl_head), GFP_KERNEL);
	if (!cache->c_block_hash)
		goto fail;
	for (n = 0; n < bucket_count; n++)
		INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
	cache->c_index_hash = kmalloc(bucket_count *
		sizeof(struct hlist_bl_head), GFP_KERNEL);
	if (!cache->c_index_hash)
		goto fail;
	for (n = 0; n < bucket_count; n++)
		INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
	if (!mb_cache_kmem_cache) {
		mb_cache_kmem_cache = kmem_cache_create(name,
			sizeof(struct mb_cache_entry), 0,
			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
		if (!mb_cache_kmem_cache)
			goto fail2;
	}
	cache->c_entry_cache = mb_cache_kmem_cache;

	/*
	 * Set an upper limit on the number of cache entries so that the hash
	 * chains won't grow too long.
	 */
	cache->c_max_entries = bucket_count << 4;

	spin_lock(&mb_cache_spinlock);
	list_add(&cache->c_cache_list, &mb_cache_list);
	spin_unlock(&mb_cache_spinlock);
	return cache;

fail2:
	kfree(cache->c_index_hash);

fail:
	kfree(cache->c_block_hash);
	kfree(cache);
	return NULL;
}
/*
 * mb_cache_shrink()
 *
 * Removes all cache entries of a device from the cache. All cache entries
 * currently in use cannot be freed, and thus remain in the cache. All
 * others are freed.
 *
 * @bdev: which device's cache entries to shrink
 */
void
mb_cache_shrink(struct block_device *bdev)
{
	LIST_HEAD(free_list);
	struct list_head *l;
	struct mb_cache_entry *ce, *tmp;

	l = &mb_cache_lru_list;
	spin_lock(&mb_cache_spinlock);
	while (!list_is_last(l, &mb_cache_lru_list)) {
		l = l->next;
		ce = list_entry(l, struct mb_cache_entry, e_lru_list);
		if (ce->e_bdev == bdev) {
			list_del_init(&ce->e_lru_list);
			if (ce->e_used || ce->e_queued ||
			    atomic_read(&ce->e_refcnt))
				continue;
			spin_unlock(&mb_cache_spinlock);
			/*
			 * Prevent any find or get operation on the entry.
			 */
			hlist_bl_lock(ce->e_block_hash_p);
			hlist_bl_lock(ce->e_index_hash_p);
			/* Ignore if it is touched by a find/get */
			if (ce->e_used || ce->e_queued ||
			    atomic_read(&ce->e_refcnt) ||
			    !list_empty(&ce->e_lru_list)) {
				hlist_bl_unlock(ce->e_index_hash_p);
				hlist_bl_unlock(ce->e_block_hash_p);
				l = &mb_cache_lru_list;
				spin_lock(&mb_cache_spinlock);
				continue;
			}
			__mb_cache_entry_unhash_unlock(ce);
			mb_assert(!(ce->e_used || ce->e_queued ||
				    atomic_read(&ce->e_refcnt)));
			list_add_tail(&ce->e_lru_list, &free_list);
			l = &mb_cache_lru_list;
			spin_lock(&mb_cache_spinlock);
		}
	}
	spin_unlock(&mb_cache_spinlock);

	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
		__mb_cache_entry_forget(ce, GFP_KERNEL);
	}
}
/*
 * mb_cache_destroy()
 *
 * Shrinks the cache to its minimum possible size (hopefully 0 entries),
 * and then destroys it. If this was the last mbcache, un-registers the
 * mbcache from kernel memory management.
 */
void
mb_cache_destroy(struct mb_cache *cache)
{
	LIST_HEAD(free_list);
	struct mb_cache_entry *ce, *tmp;

	spin_lock(&mb_cache_spinlock);
	list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
		if (ce->e_cache == cache)
			list_move_tail(&ce->e_lru_list, &free_list);
	}
	list_del(&cache->c_cache_list);
	spin_unlock(&mb_cache_spinlock);

	list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
		list_del_init(&ce->e_lru_list);
		/*
		 * Prevent any find or get operation on the entry.
		 */
		hlist_bl_lock(ce->e_block_hash_p);
		hlist_bl_lock(ce->e_index_hash_p);
		mb_assert(!(ce->e_used || ce->e_queued ||
			    atomic_read(&ce->e_refcnt)));
		__mb_cache_entry_unhash_unlock(ce);
		__mb_cache_entry_forget(ce, GFP_KERNEL);
	}

	if (atomic_read(&cache->c_entry_count) > 0) {
		mb_error("cache %s: %d orphaned entries",
			 cache->c_name,
			 atomic_read(&cache->c_entry_count));
	}

	if (list_empty(&mb_cache_list)) {
		kmem_cache_destroy(mb_cache_kmem_cache);
		mb_cache_kmem_cache = NULL;
	}
	kfree(cache->c_index_hash);
	kfree(cache->c_block_hash);
	kfree(cache);
}
/*
 * mb_cache_entry_alloc()
 *
 * Allocates a new cache entry. The new entry will not be valid initially,
 * and thus cannot be looked up yet. It should be filled with data, and
 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
 * if no more memory was available.
 */
struct mb_cache_entry *
mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
{
	struct mb_cache_entry *ce;

	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
		struct list_head *l;

		l = &mb_cache_lru_list;
		spin_lock(&mb_cache_spinlock);
		while (!list_is_last(l, &mb_cache_lru_list)) {
			l = l->next;
			ce = list_entry(l, struct mb_cache_entry, e_lru_list);
			if (ce->e_cache == cache) {
				list_del_init(&ce->e_lru_list);
				if (ce->e_used || ce->e_queued ||
				    atomic_read(&ce->e_refcnt))
					continue;
				spin_unlock(&mb_cache_spinlock);
				/*
				 * Prevent any find or get operation on the
				 * entry.
				 */
				hlist_bl_lock(ce->e_block_hash_p);
				hlist_bl_lock(ce->e_index_hash_p);
				/* Ignore if it is touched by a find/get */
				if (ce->e_used || ce->e_queued ||
				    atomic_read(&ce->e_refcnt) ||
				    !list_empty(&ce->e_lru_list)) {
					hlist_bl_unlock(ce->e_index_hash_p);
					hlist_bl_unlock(ce->e_block_hash_p);
					l = &mb_cache_lru_list;
					spin_lock(&mb_cache_spinlock);
					continue;
				}
				mb_assert(list_empty(&ce->e_lru_list));
				mb_assert(!(ce->e_used || ce->e_queued ||
					    atomic_read(&ce->e_refcnt)));
				__mb_cache_entry_unhash_unlock(ce);
				goto found;
			}
		}
		spin_unlock(&mb_cache_spinlock);
	}

	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
	if (!ce)
		return NULL;
	atomic_inc(&cache->c_entry_count);
	INIT_LIST_HEAD(&ce->e_lru_list);
	INIT_HLIST_BL_NODE(&ce->e_block_list);
	INIT_HLIST_BL_NODE(&ce->e_index.o_list);
	ce->e_cache = cache;
	ce->e_queued = 0;
	atomic_set(&ce->e_refcnt, 0);
found:
	ce->e_block_hash_p = &cache->c_block_hash[0];
	ce->e_index_hash_p = &cache->c_index_hash[0];
	ce->e_used = 1 + MB_CACHE_WRITER;
	return ce;
}
/*
 * mb_cache_entry_insert()
 *
 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
 * the cache. After this, the cache entry can be looked up, but is not yet
 * in the lru list as the caller still holds a handle to it. Returns 0 on
 * success, or -EBUSY if a cache entry for that device + block exists
 * already (this may happen after a failed lookup, but when another process
 * has inserted the same cache entry in the meantime).
 *
 * @bdev: device the cache entry belongs to
 * @block: block number
 * @key: lookup key
 */
int
mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
		      sector_t block, unsigned int key)
{
	struct mb_cache *cache = ce->e_cache;
	unsigned int bucket;
	struct hlist_bl_node *l;
	struct hlist_bl_head *block_hash_p;
	struct hlist_bl_head *index_hash_p;
	struct mb_cache_entry *lce;

	mb_assert(ce);
	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
			   cache->c_bucket_bits);
	block_hash_p = &cache->c_block_hash[bucket];
	hlist_bl_lock(block_hash_p);
	hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
		if (lce->e_bdev == bdev && lce->e_block == block) {
			hlist_bl_unlock(block_hash_p);
			return -EBUSY;
		}
	}
	mb_assert(!__mb_cache_entry_is_block_hashed(ce));
	__mb_cache_entry_unhash_block(ce);
	__mb_cache_entry_unhash_index(ce);
	ce->e_bdev = bdev;
	ce->e_block = block;
	ce->e_block_hash_p = block_hash_p;
	ce->e_index.o_key = key;
	hlist_bl_add_head(&ce->e_block_list, block_hash_p);
	hlist_bl_unlock(block_hash_p);
	bucket = hash_long(key, cache->c_bucket_bits);
	index_hash_p = &cache->c_index_hash[bucket];
	hlist_bl_lock(index_hash_p);
	ce->e_index_hash_p = index_hash_p;
	hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
	hlist_bl_unlock(index_hash_p);
	return 0;
}
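/*
 * Editorial usage sketch (not in the original source): the normal
 * allocate/insert/release cycle for a new entry.  The bdev, block and key
 * values are whatever the caller is caching; names here are illustrative.
 *
 *	struct mb_cache_entry *ce;
 *
 *	ce = mb_cache_entry_alloc(cache, GFP_NOFS);
 *	if (ce) {
 *		if (mb_cache_entry_insert(ce, bdev, block, key) == -EBUSY)
 *			mb_cache_entry_free(ce);	lost the race
 *		else
 *			mb_cache_entry_release(ce);	now findable, on lru
 *	}
 */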
/*
 * mb_cache_entry_release()
 *
 * Release a handle to a cache entry. When the last handle to a cache entry
 * is released it is either freed (if it is invalid) or otherwise inserted
 * into the lru list.
 */
void
mb_cache_entry_release(struct mb_cache_entry *ce)
{
	__mb_cache_entry_release(ce);
}
/*
 * mb_cache_entry_free()
 *
 * Invalidate a cache entry and release the handle to it: the entry is
 * removed from both hash chains so it can no longer be looked up, then
 * released via __mb_cache_entry_release().
 */
void
mb_cache_entry_free(struct mb_cache_entry *ce)
{
	mb_assert(ce);
	mb_assert(list_empty(&ce->e_lru_list));
	hlist_bl_lock(ce->e_index_hash_p);
	__mb_cache_entry_unhash_index(ce);
	hlist_bl_unlock(ce->e_index_hash_p);
	hlist_bl_lock(ce->e_block_hash_p);
	__mb_cache_entry_unhash_block(ce);
	hlist_bl_unlock(ce->e_block_hash_p);
	__mb_cache_entry_release(ce);
}
/*
 * mb_cache_entry_get()
 *
 * Get a cache entry by device / block number. (There can only be one entry
 * in the cache per device and block.) Returns NULL if no such cache entry
 * exists. The returned cache entry is locked for exclusive access ("single
 * writer").
 */
struct mb_cache_entry *
mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
		   sector_t block)
{
	unsigned int bucket;
	struct hlist_bl_node *l;
	struct mb_cache_entry *ce;
	struct hlist_bl_head *block_hash_p;

	bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
			   cache->c_bucket_bits);
	block_hash_p = &cache->c_block_hash[bucket];
	/* First serialize access to the block's corresponding hash chain. */
	hlist_bl_lock(block_hash_p);
	hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
		mb_assert(ce->e_block_hash_p == block_hash_p);
		if (ce->e_bdev == bdev && ce->e_block == block) {
			/*
			 * Prevent a free from removing the entry.
			 */
			atomic_inc(&ce->e_refcnt);
			hlist_bl_unlock(block_hash_p);
			__spin_lock_mb_cache_entry(ce);
			atomic_dec(&ce->e_refcnt);
			if (ce->e_used > 0) {
				DEFINE_WAIT(wait);

				while (ce->e_used > 0) {
					ce->e_queued++;
					prepare_to_wait(&mb_cache_queue, &wait,
							TASK_UNINTERRUPTIBLE);
					__spin_unlock_mb_cache_entry(ce);
					schedule();
					__spin_lock_mb_cache_entry(ce);
					ce->e_queued--;
				}
				finish_wait(&mb_cache_queue, &wait);
			}
			ce->e_used += 1 + MB_CACHE_WRITER;
			__spin_unlock_mb_cache_entry(ce);

			if (!list_empty(&ce->e_lru_list)) {
				spin_lock(&mb_cache_spinlock);
				list_del_init(&ce->e_lru_list);
				spin_unlock(&mb_cache_spinlock);
			}
			if (!__mb_cache_entry_is_block_hashed(ce)) {
				__mb_cache_entry_release(ce);
				return NULL;
			}
			return ce;
		}
	}
	hlist_bl_unlock(block_hash_p);
	return NULL;
}
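/*
 * Editorial usage sketch (not in the original source): mb_cache_entry_get()
 * returns a write-locked handle, so the caller must release or free it when
 * done.  Freeing is the typical path when the cached block is being dropped.
 *
 *	struct mb_cache_entry *ce;
 *
 *	ce = mb_cache_entry_get(cache, bdev, block);
 *	if (ce) {
 *		... the entry is write locked; modify or invalidate it ...
 *		mb_cache_entry_free(ce);	or mb_cache_entry_release(ce)
 *	}
 */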
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
static struct mb_cache_entry *
__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
		      struct block_device *bdev, unsigned int key)
{
	/* The index hash chain is already locked by the caller. */
	while (l != NULL) {
		struct mb_cache_entry *ce =
			hlist_bl_entry(l, struct mb_cache_entry,
				       e_index.o_list);
		mb_assert(ce->e_index_hash_p == head);
		if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
			/*
			 * Prevent a free from removing the entry.
			 */
			atomic_inc(&ce->e_refcnt);
			hlist_bl_unlock(head);
			__spin_lock_mb_cache_entry(ce);
			atomic_dec(&ce->e_refcnt);
			ce->e_used++;
			/* Incrementing before holding the lock gives readers
			   priority over writers. */
			if (ce->e_used >= MB_CACHE_WRITER) {
				DEFINE_WAIT(wait);

				while (ce->e_used >= MB_CACHE_WRITER) {
					ce->e_queued++;
					prepare_to_wait(&mb_cache_queue, &wait,
							TASK_UNINTERRUPTIBLE);
					__spin_unlock_mb_cache_entry(ce);
					schedule();
					__spin_lock_mb_cache_entry(ce);
					ce->e_queued--;
				}
				finish_wait(&mb_cache_queue, &wait);
			}
			__spin_unlock_mb_cache_entry(ce);
			if (!list_empty(&ce->e_lru_list)) {
				spin_lock(&mb_cache_spinlock);
				list_del_init(&ce->e_lru_list);
				spin_unlock(&mb_cache_spinlock);
			}
			if (!__mb_cache_entry_is_block_hashed(ce)) {
				__mb_cache_entry_release(ce);
				return ERR_PTR(-EAGAIN);
			}
			return ce;
		}
		l = l->next;
	}
	hlist_bl_unlock(head);
	return NULL;
}
/*
 * mb_cache_entry_find_first()
 *
 * Find the first cache entry on a given device with a certain key in
 * an additional index. Additional matches can be found with
 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
 * returned cache entry is locked for shared access ("multiple readers").
 *
 * @cache: the cache to search
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
			  unsigned int key)
{
	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
	struct hlist_bl_node *l;
	struct mb_cache_entry *ce = NULL;
	struct hlist_bl_head *index_hash_p;

	index_hash_p = &cache->c_index_hash[bucket];
	hlist_bl_lock(index_hash_p);
	if (!hlist_bl_empty(index_hash_p)) {
		l = hlist_bl_first(index_hash_p);
		ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
	} else
		hlist_bl_unlock(index_hash_p);
	return ce;
}
/*
 * mb_cache_entry_find_next()
 *
 * Find the next cache entry on a given device with a certain key in an
 * additional index. Returns NULL if no match could be found. The previous
 * entry is automatically released, so that mb_cache_entry_find_next() can
 * be called like this:
 *
 * entry = mb_cache_entry_find_first();
 * while (entry) {
 *	...
 *	entry = mb_cache_entry_find_next(entry, ...);
 * }
 *
 * @prev: The previous match
 * @bdev: the device the cache entry should belong to
 * @key: the key in the index
 */
struct mb_cache_entry *
mb_cache_entry_find_next(struct mb_cache_entry *prev,
			 struct block_device *bdev, unsigned int key)
{
	struct mb_cache *cache = prev->e_cache;
	unsigned int bucket = hash_long(key, cache->c_bucket_bits);
	struct hlist_bl_node *l;
	struct mb_cache_entry *ce;
	struct hlist_bl_head *index_hash_p;

	index_hash_p = &cache->c_index_hash[bucket];
	mb_assert(prev->e_index_hash_p == index_hash_p);
	hlist_bl_lock(index_hash_p);
	mb_assert(!hlist_bl_empty(index_hash_p));
	l = prev->e_index.o_list.next;
	ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
	__mb_cache_entry_release(prev);
	return ce;
}
#endif  /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
static int __init init_mbcache(void)
{
	register_shrinker(&mb_cache_shrinker);
	return 0;
}

static void __exit exit_mbcache(void)
{
	unregister_shrinker(&mb_cache_shrinker);
}

module_init(init_mbcache)
module_exit(exit_mbcache)