include/sys/arc_impl.h

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27
  28 #ifndef _SYS_ARC_IMPL_H
  29 #define _SYS_ARC_IMPL_H
  30
  31 #include <sys/arc.h>
  32 #include <sys/zio_crypt.h>
  33
  34 #ifdef __cplusplus
  35 extern "C" {
  36 #endif
  37
  38 /*
  39  * Note that buffers can be in one of 6 states:
  40  *      ARC_anon        - anonymous (discussed below)
  41  *      ARC_mru         - recently used, currently cached
  42  *      ARC_mru_ghost   - recently used, no longer in cache
  43  *      ARC_mfu         - frequently used, currently cached
  44  *      ARC_mfu_ghost   - frequently used, no longer in cache
  45  *      ARC_l2c_only    - exists in L2ARC but not other states
  46  * When there are no active references to the buffer, they are
  47  * linked onto a list in one of these arc states.  These are
  48  * the only buffers that can be evicted or deleted.  Within each
  49  * state there are multiple lists, one for meta-data and one for
  50  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  51  * etc.) is tracked separately so that it can be managed more
  52  * explicitly: favored over data, limited explicitly.
  53  *
  54  * Anonymous buffers are buffers that are not associated with
  55  * a DVA.  These are buffers that hold dirty block copies
  56  * before they are written to stable storage.  By definition,
  57  * they are "ref'd" and are considered part of arc_mru
  58  * that cannot be freed.  Generally, they will acquire a DVA
  59  * as they are written and migrate onto the arc_mru list.
  60  *
  61  * The ARC_l2c_only state is for buffers that are in the second
  62  * level ARC but no longer in any of the ARC_m* lists.  The second
  63  * level ARC itself may also contain buffers that are in any of
  64  * the ARC_m* states - meaning that a buffer can exist in two
  65  * places.  The reason for the ARC_l2c_only state is to keep the
  66  * buffer header in the hash table, so that reads that hit the
  67  * second level ARC benefit from these fast lookups.
  68  */
  69
  70 typedef struct arc_state { /* bookkeeping kept for one ARC state */
  71         /*
  72          * lists of evictable buffers; one slot per buffer contents type
  73          */
  74         multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
  75         /*
  76          * total amount of evictable data in this state, per contents type
  77          */
  78         refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
  79         /*
  80          * total amount of data in this state; this includes: evictable,
  81          * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
  82          */
  83         refcount_t arcs_size;
  84         /*
  85          * state type identifier; supports the "dbufs" kstat
  86          */
  87         arc_state_type_t arcs_state;
  88 } arc_state_t;
  89
  90 typedef struct arc_callback arc_callback_t; /* declared first: the struct refers to itself via acb_next */
  91
  92 struct arc_callback { /* one read-completion callback record */
  93         void                    *acb_private; /* NOTE(review): presumably opaque arg for acb_done -- confirm */
  94         arc_read_done_func_t    *acb_done; /* read-done callback function */
  95         arc_buf_t               *acb_buf; /* arc_buf_t associated with this callback */
  96         boolean_t               acb_encrypted; /* NOTE(review): presumably "caller wants encrypted data" -- confirm */
  97         boolean_t               acb_compressed; /* NOTE(review): presumably "caller wants compressed data" -- confirm */
  98         boolean_t               acb_noauth; /* NOTE(review): presumably "skip authentication" -- confirm */
  99         zio_t                   *acb_zio_dummy; /* NOTE(review): purpose not visible in this header -- confirm */
 100         arc_callback_t          *acb_next; /* link to another arc_callback_t record */
 101 };
 102
 103 typedef struct arc_write_callback arc_write_callback_t;
 104
 105 struct arc_write_callback { /* callback set carried with one ARC write */
 106         void                    *awcb_private; /* NOTE(review): presumably opaque arg for the callbacks below -- confirm */
 107         arc_write_done_func_t   *awcb_ready; /* the four callbacks share one function type; */
 108         arc_write_done_func_t   *awcb_children_ready; /* when each one fires is decided by the */
 109         arc_write_done_func_t   *awcb_physdone; /* write path and is not visible in this */
 110         arc_write_done_func_t   *awcb_done; /* header -- NOTE(review): confirm against callers */
 111         arc_buf_t               *awcb_buf; /* arc_buf_t associated with this write */
 112 };
 113
 114 /*
 115  * ARC buffers are separated into multiple structs as a memory saving measure:
 116  *   - Common fields struct, always defined, and embedded within it:
 117  *       - L2-only fields, always allocated but undefined when not in L2ARC
 118  *       - L1-only fields, only allocated when in L1ARC
 119  *
 120  *           Buffer in L1                     Buffer only in L2
 121  *    +------------------------+          +------------------------+
 122  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 123  *    |                        |          |                        |
 124  *    |                        |          |                        |
 125  *    |                        |          |                        |
 126  *    +------------------------+          +------------------------+
 127  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 128  *    | (undefined if L1-only) |          |                        |
 129  *    +------------------------+          +------------------------+
 130  *    | l1arc_buf_hdr_t        |
 131  *    |                        |
 132  *    |                        |
 133  *    |                        |
 134  *    |                        |
 135  *    +------------------------+
 136  *
 137  * Because it's possible for the L2ARC to become extremely large, we can wind
 138  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 139  * is minimized by only allocating the fields necessary for an L1-cached buffer
 140  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 141  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 142  * words in pointers. arc_hdr_realloc() is used to switch a header between
 143  * these two allocation states.
 144  */
 145 typedef struct l1arc_buf_hdr { /* L1-only part of an arc_buf_hdr */
 146         kmutex_t                b_freeze_lock; /* NOTE(review): presumably guards b_freeze_cksum -- confirm */
 147         zio_cksum_t             *b_freeze_cksum; /* pointer to a checksum; NULL-ness semantics not visible here */
 148
 149         arc_buf_t               *b_buf; /* arc_buf_t attached to this header */
 150         uint32_t                b_bufcnt; /* NOTE(review): presumably count of bufs reachable from b_buf -- confirm */
 151         /* for waiting on writes to complete */
 152         kcondvar_t              b_cv;
 153         uint8_t                 b_byteswap; /* NOTE(review): presumably byteswap function selector -- confirm */
 154
 155
 156         /* protected by arc state mutex */
 157         arc_state_t             *b_state; /* ARC state this header is currently in */
 158         multilist_node_t        b_arc_node; /* NOTE(review): presumably linkage for b_state's arcs_list -- confirm */
 159
 160         /* updated atomically */
 161         clock_t                 b_arc_access; /* NOTE(review): presumably time of last access -- confirm */
 162         uint32_t                b_mru_hits; /* hit counters; NOTE(review): names suggest */
 163         uint32_t                b_mru_ghost_hits; /* one counter per state in which the hit */
 164         uint32_t                b_mfu_hits; /* was observed (MRU, MRU ghost, MFU, MFU */
 165         uint32_t                b_mfu_ghost_hits; /* ghost) plus L2ARC hits -- confirm */
 166         uint32_t                b_l2_hits;
 167
 168         /* self protecting */
 169         refcount_t              b_refcnt; /* reference count on this header */
 170
 171         arc_callback_t          *b_acb; /* arc_callback_t record(s); records link via acb_next */
 172         abd_t                   *b_pabd; /* NOTE(review): presumably the block's physical data -- confirm */
 173 } l1arc_buf_hdr_t;
 174
 175 /*
 176  * Encrypted blocks will need to be stored encrypted on the L2ARC
 177  * disk as they appear in the main pool. In order for this to work we
 178  * need to pass around the encryption parameters so they can be used
 179  * to write data to the L2ARC. This struct is only defined in the
 180  * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
 181  * flag set.
 182  */
 183 typedef struct arc_buf_hdr_crypt { /* per-header encryption parameters */
 184         abd_t                   *b_rabd;        /* raw encrypted data */
 185         dmu_object_type_t       b_ot;           /* object type */
 186         uint32_t                b_ebufcnt;      /* count of encrypted buffers */
 187
 188         /* dsobj for looking up encryption key for l2arc encryption */
 189         uint64_t                b_dsobj;
 190
 191         /* encryption parameters; array sizes are the ZIO_DATA_*_LEN constants */
 192         uint8_t                 b_salt[ZIO_DATA_SALT_LEN];
 193         uint8_t                 b_iv[ZIO_DATA_IV_LEN];
 194
 195         /*
 196          * Technically this could be removed since we will always be able to
 197          * get the mac from the bp when we need it. However, it is inconvenient
 198          * for callers of arc code to have to pass a bp in all the time. This
 199          * also allows us to assert that L2ARC data is properly encrypted to
 200          * match the data in the main storage pool.
 201          */
 202         uint8_t                 b_mac[ZIO_DATA_MAC_LEN];
 203 } arc_buf_hdr_crypt_t;
 204
 205 typedef struct l2arc_dev { /* state kept for one L2ARC device */
 206         vdev_t                  *l2ad_vdev;     /* vdev */
 207         spa_t                   *l2ad_spa;      /* spa */
 208         uint64_t                l2ad_hand;      /* next write location */
 209         uint64_t                l2ad_start;     /* first addr on device */
 210         uint64_t                l2ad_end;       /* last addr on device */
 211         boolean_t               l2ad_first;     /* first sweep through */
 212         boolean_t               l2ad_writing;   /* currently writing */
 213         kmutex_t                l2ad_mtx;       /* lock for l2ad_buflist */
 214         list_t                  l2ad_buflist;   /* buffer list; NOTE(review): presumably linked via b_l2node -- confirm */
 215         list_node_t             l2ad_node;      /* device list node */
 216         refcount_t              l2ad_alloc;     /* allocated bytes */
 217 } l2arc_dev_t;
 218
 219 typedef struct l2arc_buf_hdr { /* L2ARC part of an arc_buf_hdr */
 220         /* protected by arc_buf_hdr mutex */
 221         l2arc_dev_t             *b_dev;         /* L2ARC device */
 222         uint64_t                b_daddr;        /* disk address, offset byte */
 223         uint32_t                b_hits;         /* hit counter; NOTE(review): exact update rule not visible here */
 224
 225         list_node_t             b_l2node;       /* list linkage; NOTE(review): presumably for l2ad_buflist -- confirm */
 226 } l2arc_buf_hdr_t;
 227
 228 typedef struct l2arc_write_callback { /* context carried with one L2ARC write */
 229         l2arc_dev_t     *l2wcb_dev;             /* device info */
 230         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist */
 231 } l2arc_write_callback_t;
 232
 233 struct arc_buf_hdr { /* common header; embeds the L2, L1 and crypt sub-headers */
 234         /* protected by hash lock */
 235         dva_t                   b_dva; /* DVA of the block */
 236         uint64_t                b_birth; /* NOTE(review): presumably the block's birth txg -- confirm */
 237
 238         arc_buf_contents_t      b_type; /* contents type, e.g. ARC_BUFC_DATA or ARC_BUFC_METADATA */
 239         arc_buf_hdr_t           *b_hash_next; /* link to another header; NOTE(review): presumably hash chain -- confirm */
 240         arc_flags_t             b_flags; /* flag bits, e.g. ARC_FLAG_ENCRYPTED */
 241
 242         /*
 243          * This field stores the size of the data buffer after
 244          * compression, and is set in the arc's zio completion handlers.
 245          * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
 246          *
 247          * While the block pointers can store up to 32MB in their psize
 248          * field, we can only store up to 32MB minus 512B. This is due
 249          * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
 250          * a field of zeros represents 512B in the bp). We can't use a
 251          * bias of 1 since we need to reserve a psize of zero, here, to
 252          * represent holes and embedded blocks.
 253          *
 254          * This isn't a problem in practice, since the maximum size of a
 255          * buffer is limited to 16MB, so we never need to store 32MB in
 256          * this field. Even in the upstream illumos code base, the
 257          * maximum size of a buffer is limited to 16MB.
 258          */
 259         uint16_t                b_psize; /* 16 bits of 512B units: max 65535 * 512B = 32MB - 512B */
 260
 261         /*
 262          * This field stores the size of the data buffer before
 263          * compression, and cannot change once set. It is in units
 264          * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
 265          */
 266         uint16_t                b_lsize;        /* immutable */
 267         uint64_t                b_spa;          /* immutable */
 268
 269         /* L2ARC fields. Undefined when not in L2ARC. */
 270         l2arc_buf_hdr_t         b_l2hdr;
 271         /* L1ARC fields. Undefined when in l2arc_only state */
 272         l1arc_buf_hdr_t         b_l1hdr;
 273         /*
 274          * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
 275          * is set and the L1 header exists.
 276          */
 277         arc_buf_hdr_crypt_t b_crypt_hdr;
 278 };
 279 #ifdef __cplusplus
 280 }
 281 #endif
 282
 283 #endif /* _SYS_ARC_IMPL_H */