include/sys/arc_impl.h

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  26  */
  27
  28 #ifndef _SYS_ARC_IMPL_H
  29 #define _SYS_ARC_IMPL_H
  30
  31 #include <sys/arc.h>
  32 #include <sys/zio_crypt.h>
  33
  34 #ifdef __cplusplus
  35 extern "C" {
  36 #endif
  37
  38 /*
  39  * Note that buffers can be in one of 6 states:
  40  *      ARC_anon        - anonymous (discussed below)
  41  *      ARC_mru         - recently used, currently cached
  42  *      ARC_mru_ghost   - recently used, no longer in cache
  43  *      ARC_mfu         - frequently used, currently cached
  44  *      ARC_mfu_ghost   - frequently used, no longer in cache
  45  *      ARC_l2c_only    - exists in L2ARC but not other states
  46  * When there are no active references to the buffer, they are
  47  * linked onto a list in one of these arc states.  These are
  48  * the only buffers that can be evicted or deleted.  Within each
  49  * state there are multiple lists, one for meta-data and one for
  50  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
  51  * etc.) is tracked separately so that it can be managed more
  52  * explicitly: favored over data, limited explicitly.
  53  *
  54  * Anonymous buffers are buffers that are not associated with
  55  * a DVA.  These are buffers that hold dirty block copies
  56  * before they are written to stable storage.  By definition,
  57  * they are "ref'd" and are considered part of arc_mru
  58  * that cannot be freed.  Generally, they will acquire a DVA
  59  * as they are written and migrate onto the arc_mru list.
  60  *
  61  * The ARC_l2c_only state is for buffers that are in the second
  62  * level ARC but no longer in any of the ARC_m* lists.  The second
  63  * level ARC itself may also contain buffers that are in any of
  64  * the ARC_m* states - meaning that a buffer can exist in two
  65  * places.  The reason for the ARC_l2c_only state is to keep the
  66  * buffer header in the hash table, so that reads that hit the
  67  * second level ARC benefit from these fast lookups.
  68  */
  69
  70 typedef struct arc_state {      /* lists and size accounting for one ARC state */
  71         /*
  72          * lists of evictable buffers, one multilist per buffer contents type
  73          */
  74         multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
  75         /*
  76          * total amount of evictable data in this state, per buffer contents type
  77          */
  78         refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
  79         /*
  80          * total amount of data in this state; this includes: evictable,
  81          * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
  82          */
  83         refcount_t arcs_size;
  84         /*
  85          * this state's arc_state_type_t value; supports the "dbufs" kstat
  86          */
  87         arc_state_type_t arcs_state;
  88 } arc_state_t;
  89
  90 typedef struct arc_callback arc_callback_t;
  91
  92 struct arc_callback {   /* one entry in a chain of read callbacks (linked via acb_next) */
  93         void                    *acb_private;   /* presumably the argument handed to acb_done - TODO confirm */
  94         arc_read_done_func_t    *acb_done;      /* read-done callback function */
  95         arc_buf_t               *acb_buf;       /* arc_buf_t for this callback */
  96         boolean_t               acb_encrypted;  /* NOTE(review): looks like "caller wants encrypted data" - confirm */
  97         boolean_t               acb_compressed; /* NOTE(review): looks like "caller wants compressed data" - confirm */
  98         boolean_t               acb_noauth;     /* NOTE(review): looks like "skip authentication" - confirm */
  99         uint64_t                acb_dsobj;      /* presumably dataset object for key lookup (cf. b_dsobj) - confirm */
 100         zio_t                   *acb_zio_dummy; /* zio tied to this callback; its role is not visible in this header */
 101         arc_callback_t          *acb_next;      /* next callback in the chain */
 102 };
 103
 104 typedef struct arc_write_callback arc_write_callback_t;
 105
 106 struct arc_write_callback {     /* four write-done hooks, a private pointer and an arc_buf_t */
 107         void                    *awcb_private;  /* presumably the argument handed to the hooks below - TODO confirm */
 108         arc_write_done_func_t   *awcb_ready;    /* NOTE(review): names suggest one hook per write stage - confirm */
 109         arc_write_done_func_t   *awcb_children_ready;
 110         arc_write_done_func_t   *awcb_physdone;
 111         arc_write_done_func_t   *awcb_done;
 112         arc_buf_t               *awcb_buf;      /* arc_buf_t for this write callback */
 113 };
 114
 115 /*
 116  * ARC buffers are separated into multiple structs as a memory saving measure:
 117  *   - Common fields struct, always defined, and embedded within it:
 118  *       - L2-only fields, always allocated but undefined when not in L2ARC
 119  *       - L1-only fields, only allocated when in L1ARC
 120  *
 121  *           Buffer in L1                     Buffer only in L2
 122  *    +------------------------+          +------------------------+
 123  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 124  *    |                        |          |                        |
 125  *    |                        |          |                        |
 126  *    |                        |          |                        |
 127  *    +------------------------+          +------------------------+
 128  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 129  *    | (undefined if L1-only) |          |                        |
 130  *    +------------------------+          +------------------------+
 131  *    | l1arc_buf_hdr_t        |
 132  *    |                        |
 133  *    |                        |
 134  *    |                        |
 135  *    |                        |
 136  *    +------------------------+
 137  *
 138  * Because it's possible for the L2ARC to become extremely large, we can wind
 139  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 140  * is minimized by only allocating the fields necessary for an L1-cached buffer
 141  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 142  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 143  * words in pointers. arc_hdr_realloc() is used to switch a header between
 144  * these two allocation states.
 145  */
 146 typedef struct l1arc_buf_hdr {  /* L1-only header fields; embedded in arc_buf_hdr as b_l1hdr */
 147         kmutex_t                b_freeze_lock;  /* presumably guards b_freeze_cksum - TODO confirm */
 148         zio_cksum_t             *b_freeze_cksum;        /* pointer to a zio_cksum_t for this header */
 149
 150         arc_buf_t               *b_buf;         /* arc_buf_t for this header; presumably a list head - confirm */
 151         uint32_t                b_bufcnt;       /* presumably the number of bufs under b_buf - confirm */
 152         /* for waiting on writes to complete */
 153         kcondvar_t              b_cv;
 154         uint8_t                 b_byteswap;     /* byteswap setting; encoding is not visible in this header */
 155
 156
 157         /* protected by arc state mutex */
 158         arc_state_t             *b_state;       /* ARC state this header is in */
 159         multilist_node_t        b_arc_node;     /* node for linking onto a state's multilist */
 160
 161         /* updated atomically */
 162         clock_t                 b_arc_access;   /* presumably time of last access - TODO confirm */
 163         uint32_t                b_mru_hits;     /* hit counters; names indicate the state each one counts */
 164         uint32_t                b_mru_ghost_hits;
 165         uint32_t                b_mfu_hits;
 166         uint32_t                b_mfu_ghost_hits;
 167         uint32_t                b_l2_hits;
 168
 169         /* self protecting */
 170         refcount_t              b_refcnt;
 171
 172         arc_callback_t          *b_acb;         /* arc_callback_t chain (entries linked via acb_next) */
 173         abd_t                   *b_pabd;        /* abd_t data pointer; cf. b_rabd for raw encrypted data */
 174 } l1arc_buf_hdr_t;
 175
 176 /*
 177  * Encrypted blocks will need to be stored encrypted on the L2ARC
 178  * disk as they appear in the main pool. In order for this to work we
 179  * need to pass around the encryption parameters so they can be used
 180  * to write data to the L2ARC. This struct is only defined in the
 181  * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
 182  * flag set.
 183  */
 184 typedef struct arc_buf_hdr_crypt {      /* encryption parameters carried in the header */
 185         abd_t                   *b_rabd;        /* abd_t holding the raw encrypted data */
 186         dmu_object_type_t       b_ot;           /* DMU object type */
 187         uint32_t                b_ebufcnt;      /* count of encrypted buffers */
 188
 189         /* dsobj for looking up encryption key for l2arc encryption */
 190         uint64_t                b_dsobj;
 191
 192         /* encryption parameters: salt and IV, sized by the ZIO_DATA_* constants */
 193         uint8_t                 b_salt[ZIO_DATA_SALT_LEN];
 194         uint8_t                 b_iv[ZIO_DATA_IV_LEN];
 195
 196         /*
 197          * Technically this could be removed since we will always be able to
 198          * get the mac from the bp when we need it. However, it is inconvenient
 199          * for callers of arc code to have to pass a bp in all the time. This
 200          * also allows us to assert that L2ARC data is properly encrypted to
 201          * match the data in the main storage pool.
 202          */
 203         uint8_t                 b_mac[ZIO_DATA_MAC_LEN];        /* MAC, also obtainable from the bp */
 204 } arc_buf_hdr_crypt_t;
 205
 206 typedef struct l2arc_dev {      /* state for one L2ARC device */
 207         vdev_t                  *l2ad_vdev;     /* vdev for this device */
 208         spa_t                   *l2ad_spa;      /* spa for this device */
 209         uint64_t                l2ad_hand;      /* next write location */
 210         uint64_t                l2ad_start;     /* first addr on device */
 211         uint64_t                l2ad_end;       /* last addr on device */
 212         boolean_t               l2ad_first;     /* first sweep through */
 213         boolean_t               l2ad_writing;   /* currently writing */
 214         kmutex_t                l2ad_mtx;       /* lock for l2ad_buflist */
 215         list_t                  l2ad_buflist;   /* buffer list (locked by l2ad_mtx) */
 216         list_node_t             l2ad_node;      /* node in the device list */
 217         refcount_t              l2ad_alloc;     /* allocated bytes */
 218 } l2arc_dev_t;
 219
 220 typedef struct l2arc_buf_hdr {  /* L2ARC header fields; embedded in arc_buf_hdr as b_l2hdr */
 221         /* protected by arc_buf_hdr mutex */
 222         l2arc_dev_t             *b_dev;         /* L2ARC device */
 223         uint64_t                b_daddr;        /* disk address, as a byte offset */
 224         uint32_t                b_hits;         /* hit counter; presumably L2ARC hits - TODO confirm */
 225
 226         list_node_t             b_l2node;       /* list node; presumably links into l2ad_buflist - TODO confirm */
 227 } l2arc_buf_hdr_t;
 228
 229 typedef struct l2arc_write_callback {   /* device and header list for an L2ARC write callback */
 230         l2arc_dev_t     *l2wcb_dev;             /* device info (l2arc_dev_t) */
 231         arc_buf_hdr_t   *l2wcb_head;            /* head of write buflist (an arc_buf_hdr_t) */
 232 } l2arc_write_callback_t;
 233
 234 struct arc_buf_hdr {    /* common header: shared fields plus embedded L2, L1 and crypt sub-headers */
 235         /* protected by hash lock */
 236         dva_t                   b_dva;          /* DVA associated with this buffer */
 237         uint64_t                b_birth;        /* presumably the block's birth txg - TODO confirm */
 238
 239         arc_buf_contents_t      b_type;         /* buffer contents type (cf. ARC_BUFC_DATA / ARC_BUFC_METADATA) */
 240         arc_buf_hdr_t           *b_hash_next;   /* next header; presumably the hash table chain link - confirm */
 241         arc_flags_t             b_flags;        /* arc_flags_t bits, e.g. ARC_FLAG_ENCRYPTED */
 242
 243         /*
 244          * This field stores the size of the data buffer after
 245          * compression, and is set in the arc's zio completion handlers.
 246          * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
 247          *
 248          * While the block pointers can store up to 32MB in their psize
 249          * field, we can only store up to 32MB minus 512B. This is due
 250          * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
 251          * a field of zeros represents 512B in the bp). We can't use a
 252          * bias of 1 since we need to reserve a psize of zero, here, to
 253          * represent holes and embedded blocks.
 254          *
 255          * This isn't a problem in practice, since the maximum size of a
 256          * buffer is limited to 16MB, so we never need to store 32MB in
 257          * this field. Even in the upstream illumos code base, the
 258          * maximum size of a buffer is limited to 16MB.
 259          */
 260         uint16_t                b_psize;
 261
 262         /*
 263          * This field stores the size of the data buffer before
 264          * compression, and cannot change once set. It is in units
 265          * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes).
 266          */
 267         uint16_t                b_lsize;        /* immutable */
 268         uint64_t                b_spa;          /* immutable */
 269
 270         /* L2ARC fields. Undefined when not in L2ARC. */
 271         l2arc_buf_hdr_t         b_l2hdr;
 272         /* L1ARC fields. Undefined when in l2arc_only state */
 273         l1arc_buf_hdr_t         b_l1hdr;
 274         /*
 275          * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
 276          * is set and the L1 header exists.
 277          */
 278         arc_buf_hdr_crypt_t b_crypt_hdr;
 279 };
 280 #ifdef __cplusplus
 281 }
 282 #endif
 283
 284 #endif /* _SYS_ARC_IMPL_H */