]> git.proxmox.com Git - mirror_qemu.git/blobdiff - block/qcow2-cache.c
spapr: fix memory leak in spapr_core_pre_plug()
[mirror_qemu.git] / block / qcow2-cache.c
index 7bcae09a69f45f044784fb2a411f401a2bbae1e2..1d25147392a5a0818a8a773c8f5d995e68e5bb93 100644 (file)
  * THE SOFTWARE.
  */
 
+#include "qemu/osdep.h"
 #include "block/block_int.h"
 #include "qemu-common.h"
 #include "qcow2.h"
 #include "trace.h"
 
 typedef struct Qcow2CachedTable {
-    void*   table;
-    int64_t offset;
-    bool    dirty;
-    int     cache_hits;
-    int     ref;
+    int64_t  offset;
+    uint64_t lru_counter;
+    int      ref;
+    bool     dirty;
 } Qcow2CachedTable;
 
 struct Qcow2Cache {
-    Qcow2CachedTable*       entries;
-    struct Qcow2Cache*      depends;
+    Qcow2CachedTable       *entries;
+    struct Qcow2Cache      *depends;
     int                     size;
     bool                    depends_on_flush;
+    void                   *table_array;
+    uint64_t                lru_counter;
+    uint64_t                cache_clean_lru_counter;
 };
 
+static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs,
+                    Qcow2Cache *c, int table)
+{
+    BDRVQcow2State *s = bs->opaque;
+    return (uint8_t *) c->table_array + (size_t) table * s->cluster_size;
+}
+
+static inline int qcow2_cache_get_table_idx(BlockDriverState *bs,
+                  Qcow2Cache *c, void *table)
+{
+    BDRVQcow2State *s = bs->opaque;
+    ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array;
+    int idx = table_offset / s->cluster_size;
+    assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0);
+    return idx;
+}
+
+static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c,
+                                      int i, int num_tables)
+{
+/* Using MADV_DONTNEED to discard memory is a Linux-specific feature */
+#ifdef CONFIG_LINUX
+    BDRVQcow2State *s = bs->opaque;
+    void *t = qcow2_cache_get_table_addr(bs, c, i);
+    int align = getpagesize();
+    size_t mem_size = (size_t) s->cluster_size * num_tables;
+    size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t;
+    size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align);
+    if (length > 0) {
+        madvise((uint8_t *) t + offset, length, MADV_DONTNEED);
+    }
+#endif
+}
+
+static inline bool can_clean_entry(Qcow2Cache *c, int i)
+{
+    Qcow2CachedTable *t = &c->entries[i];
+    return t->ref == 0 && !t->dirty && t->offset != 0 &&
+        t->lru_counter <= c->cache_clean_lru_counter;
+}
+
+void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int i = 0;
+    while (i < c->size) {
+        int to_clean = 0;
+
+        /* Skip the entries that we don't need to clean */
+        while (i < c->size && !can_clean_entry(c, i)) {
+            i++;
+        }
+
+        /* And count how many we can clean in a row */
+        while (i < c->size && can_clean_entry(c, i)) {
+            c->entries[i].offset = 0;
+            c->entries[i].lru_counter = 0;
+            i++;
+            to_clean++;
+        }
+
+        if (to_clean > 0) {
+            qcow2_cache_table_release(bs, c, i - to_clean, to_clean);
+        }
+    }
+
+    c->cache_clean_lru_counter = c->lru_counter;
+}
+
 Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables)
 {
-    BDRVQcowState *s = bs->opaque;
+    BDRVQcow2State *s = bs->opaque;
     Qcow2Cache *c;
-    int i;
 
-    c = g_malloc0(sizeof(*c));
+    c = g_new0(Qcow2Cache, 1);
     c->size = num_tables;
-    c->entries = g_malloc0(sizeof(*c->entries) * num_tables);
-
-    for (i = 0; i < c->size; i++) {
-        c->entries[i].table = qemu_blockalign(bs, s->cluster_size);
+    c->entries = g_try_new0(Qcow2CachedTable, num_tables);
+    c->table_array = qemu_try_blockalign(bs->file->bs,
+                                         (size_t) num_tables * s->cluster_size);
+
+    if (!c->entries || !c->table_array) {
+        qemu_vfree(c->table_array);
+        g_free(c->entries);
+        g_free(c);
+        c = NULL;
     }
 
     return c;
 }
 
-int qcow2_cache_destroy(BlockDriverStatebs, Qcow2Cache *c)
+int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c)
 {
     int i;
 
     for (i = 0; i < c->size; i++) {
         assert(c->entries[i].ref == 0);
-        qemu_vfree(c->entries[i].table);
     }
 
+    qemu_vfree(c->table_array);
     g_free(c->entries);
     g_free(c);
 
@@ -91,7 +166,7 @@ static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c)
 
 static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
 {
-    BDRVQcowState *s = bs->opaque;
+    BDRVQcow2State *s = bs->opaque;
     int ret = 0;
 
     if (!c->entries[i].dirty || !c->entries[i].offset) {
@@ -104,7 +179,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     if (c->depends) {
         ret = qcow2_cache_flush_dependency(bs, c);
     } else if (c->depends_on_flush) {
-        ret = bdrv_flush(bs->file);
+        ret = bdrv_flush(bs->file->bs);
         if (ret >= 0) {
             c->depends_on_flush = false;
         }
@@ -115,15 +190,13 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     }
 
     if (c == s->refcount_block_cache) {
-        ret = qcow2_pre_write_overlap_check(bs,
-                QCOW2_OL_DEFAULT & ~QCOW2_OL_REFCOUNT_BLOCK,
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK,
                 c->entries[i].offset, s->cluster_size);
     } else if (c == s->l2_table_cache) {
-        ret = qcow2_pre_write_overlap_check(bs,
-                QCOW2_OL_DEFAULT & ~QCOW2_OL_ACTIVE_L2,
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
                 c->entries[i].offset, s->cluster_size);
     } else {
-        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+        ret = qcow2_pre_write_overlap_check(bs, 0,
                 c->entries[i].offset, s->cluster_size);
     }
 
@@ -137,8 +210,8 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
         BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
     }
 
-    ret = bdrv_pwrite(bs->file, c->entries[i].offset, c->entries[i].table,
-        s->cluster_size);
+    ret = bdrv_pwrite(bs->file, c->entries[i].offset,
+                      qcow2_cache_get_table_addr(bs, c, i), s->cluster_size);
     if (ret < 0) {
         return ret;
     }
@@ -148,9 +221,9 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     return 0;
 }
 
-int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+int qcow2_cache_write(BlockDriverState *bs, Qcow2Cache *c)
 {
-    BDRVQcowState *s = bs->opaque;
+    BDRVQcow2State *s = bs->opaque;
     int result = 0;
     int ret;
     int i;
@@ -164,8 +237,15 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
         }
     }
 
+    return result;
+}
+
+int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c)
+{
+    int result = qcow2_cache_write(bs, c);
+
     if (result == 0) {
-        ret = bdrv_flush(bs->file);
+        int ret = bdrv_flush(bs->file->bs);
         if (ret < 0) {
             result = ret;
         }
@@ -202,60 +282,67 @@ void qcow2_cache_depends_on_flush(Qcow2Cache *c)
     c->depends_on_flush = true;
 }
 
-static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
+int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c)
 {
-    int i;
-    int min_count = INT_MAX;
-    int min_index = -1;
+    int ret, i;
 
+    ret = qcow2_cache_flush(bs, c);
+    if (ret < 0) {
+        return ret;
+    }
 
     for (i = 0; i < c->size; i++) {
-        if (c->entries[i].ref) {
-            continue;
-        }
+        assert(c->entries[i].ref == 0);
+        c->entries[i].offset = 0;
+        c->entries[i].lru_counter = 0;
+    }
 
-        if (c->entries[i].cache_hits < min_count) {
-            min_index = i;
-            min_count = c->entries[i].cache_hits;
-        }
+    qcow2_cache_table_release(bs, c, 0, c->size);
 
-        /* Give newer hits priority */
-        /* TODO Check how to optimize the replacement strategy */
-        c->entries[i].cache_hits /= 2;
-    }
+    c->lru_counter = 0;
 
-    if (min_index == -1) {
-        /* This can't happen in current synchronous code, but leave the check
-         * here as a reminder for whoever starts using AIO with the cache */
-        abort();
-    }
-    return min_index;
+    return 0;
 }
 
 static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
     uint64_t offset, void **table, bool read_from_disk)
 {
-    BDRVQcowState *s = bs->opaque;
+    BDRVQcow2State *s = bs->opaque;
     int i;
     int ret;
+    int lookup_index;
+    uint64_t min_lru_counter = UINT64_MAX;
+    int min_lru_index = -1;
 
     trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache,
                           offset, read_from_disk);
 
     /* Check if the table is already cached */
-    for (i = 0; i < c->size; i++) {
-        if (c->entries[i].offset == offset) {
+    i = lookup_index = (offset / s->cluster_size * 4) % c->size;
+    do {
+        const Qcow2CachedTable *t = &c->entries[i];
+        if (t->offset == offset) {
             goto found;
         }
+        if (t->ref == 0 && t->lru_counter < min_lru_counter) {
+            min_lru_counter = t->lru_counter;
+            min_lru_index = i;
+        }
+        if (++i == c->size) {
+            i = 0;
+        }
+    } while (i != lookup_index);
+
+    if (min_lru_index == -1) {
+        /* This can't happen in current synchronous code, but leave the check
+         * here as a reminder for whoever starts using AIO with the cache */
+        abort();
     }
 
-    /* If not, write a table back and replace it */
-    i = qcow2_cache_find_entry_to_replace(c);
+    /* Cache miss: write a table back and replace it */
+    i = min_lru_index;
     trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(),
                                         c == s->l2_table_cache, i);
-    if (i < 0) {
-        return i;
-    }
 
     ret = qcow2_cache_entry_flush(bs, c, i);
     if (ret < 0) {
@@ -270,22 +357,20 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c,
             BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD);
         }
 
-        ret = bdrv_pread(bs->file, offset, c->entries[i].table, s->cluster_size);
+        ret = bdrv_pread(bs->file, offset,
+                         qcow2_cache_get_table_addr(bs, c, i),
+                         s->cluster_size);
         if (ret < 0) {
             return ret;
         }
     }
 
-    /* Give the table some hits for the start so that it won't be replaced
-     * immediately. The number 32 is completely arbitrary. */
-    c->entries[i].cache_hits = 32;
     c->entries[i].offset = offset;
 
     /* And return the right table */
 found:
-    c->entries[i].cache_hits++;
     c->entries[i].ref++;
-    *table = c->entries[i].table;
+    *table = qcow2_cache_get_table_addr(bs, c, i);
 
     trace_qcow2_cache_get_done(qemu_coroutine_self(),
                                c == s->l2_table_cache, i);
@@ -305,36 +390,24 @@ int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset,
     return qcow2_cache_do_get(bs, c, offset, table, false);
 }
 
-int qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
+void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table)
 {
-    int i;
+    int i = qcow2_cache_get_table_idx(bs, c, *table);
 
-    for (i = 0; i < c->size; i++) {
-        if (c->entries[i].table == *table) {
-            goto found;
-        }
-    }
-    return -ENOENT;
-
-found:
     c->entries[i].ref--;
     *table = NULL;
 
+    if (c->entries[i].ref == 0) {
+        c->entries[i].lru_counter = ++c->lru_counter;
+    }
+
     assert(c->entries[i].ref >= 0);
-    return 0;
 }
 
-void qcow2_cache_entry_mark_dirty(Qcow2Cache *c, void *table)
+void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c,
+     void *table)
 {
-    int i;
-
-    for (i = 0; i < c->size; i++) {
-        if (c->entries[i].table == table) {
-            goto found;
-        }
-    }
-    abort();
-
-found:
+    int i = qcow2_cache_get_table_idx(bs, c, table);
+    assert(c->entries[i].offset != 0);
     c->entries[i].dirty = true;
 }