]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
34dc7c2f BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
b8738257 | 23 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. |
34dc7c2f BB |
24 | */ |
25 | ||
428870ff BB |
26 | /* Portions Copyright 2010 Robert Milkowski */ |
27 | ||
34dc7c2f BB |
28 | #ifndef _SYS_ZIL_IMPL_H |
29 | #define _SYS_ZIL_IMPL_H | |
30 | ||
34dc7c2f BB |
31 | #include <sys/zil.h> |
32 | #include <sys/dmu_objset.h> | |
33 | ||
34 | #ifdef __cplusplus | |
35 | extern "C" { | |
36 | #endif | |
37 | ||
38 | /* | |
2fe61a7e PS |
39 | * Possible states for a given lwb structure. |
40 | * | |
eda3fcd5 AM |
41 | * An lwb will start out in the "new" state, and transition to the "opened" |
42 | * state via a call to zil_lwb_write_open() on first itx assignment. When | |
43 | * transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be | |
44 | * held. | |
2fe61a7e | 45 | * |
eda3fcd5 AM |
46 | * After the lwb is "opened", it can be assigned a number of itxs and transition |
47 | * into the "closed" state via zil_lwb_write_close() when full or on timeout. | |
48 | * When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock" | |
49 | * must be held. New lwb allocation also takes "zl_lock" to protect the list. | |
50 | * | |
51 | * After the lwb is "closed", it can transition into the "ready" state via | |
52 | * zil_lwb_write_issue(). "zl_lock" must be held when making this transition. | |
53 | * Since it is done by the same thread, "zl_issuer_lock" is not needed. | |
54 | * | |
55 | * When lwb in "ready" state receives its block pointer, it can transition to | |
56 | * "issued". "zl_lock" must be held when making this transition. | |
2fe61a7e | 57 | * |
900d09b2 PS |
58 | * After the lwb's write zio completes, it transitions into the "write |
59 | * done" state via zil_lwb_write_done(); and then into the "flush done" | |
60 | * state via zil_lwb_flush_vdevs_done(). When transitioning from | |
61 | * "issued" to "write done", and then from "write done" to "flush done", | |
62 | * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock". | |
2fe61a7e PS |
63 | * |
64 | * The zilog's "zl_issuer_lock" can become heavily contended in certain | |
65 | * workloads, so we specifically avoid acquiring that lock when | |
66 | * transitioning an lwb from "issued" to "done". This allows us to avoid | |
67 | * having to acquire the "zl_issuer_lock" for each lwb ZIO completion, | |
68 | * which would have added more lock contention on an already heavily | |
69 | * contended lock. | |
70 | * | |
71 | * Additionally, correctness when reading an lwb's state is often | |
72 | * achieved by exploiting the fact that these state transitions occur in | |
eda3fcd5 AM |
73 | * this specific order; i.e. "new" to "opened" to "closed" to "ready" to |
74 | * "issued" to "write_done" and finally "flush_done". | |
2fe61a7e | 75 | * |
eda3fcd5 | 76 | * Thus, if an lwb is in the "new" or "opened" state, holding the |
2fe61a7e | 77 | * "zl_issuer_lock" will prevent a concurrent thread from transitioning |
eda3fcd5 AM |
78 | * that lwb to the "closed" state. Likewise, if an lwb is already in the |
79 | * "ready" state, holding the "zl_lock" will prevent a concurrent thread | |
80 | * from transitioning that lwb to the "issued" state. | |
1ce23dca PS |
81 | */ |
82 | typedef enum { | |
eda3fcd5 | 83 | LWB_STATE_NEW, /* initial state, before first itx assignment */ |
1ce23dca | 84 | LWB_STATE_OPENED, /* entered via zil_lwb_write_open() */ |
eda3fcd5 AM |
85 | LWB_STATE_CLOSED, /* entered via zil_lwb_write_close() */ |
86 | LWB_STATE_READY, /* entered via zil_lwb_write_issue() */ | |
1ce23dca | 87 | LWB_STATE_ISSUED, /* "ready" lwb has received its block pointer */ |
900d09b2 PS |
88 | LWB_STATE_WRITE_DONE, /* entered via zil_lwb_write_done() */ |
89 | LWB_STATE_FLUSH_DONE, /* entered via zil_lwb_flush_vdevs_done() */ | |
1ce23dca PS |
90 | LWB_NUM_STATES /* count of the states listed above */ |
91 | } lwb_state_t; | |
92 | ||
93 | /* | |
94 | * Log write block (lwb) | |
95 | * | |
96 | * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it | |
1b2b0aca | 97 | * will be protected by the zilog's "zl_issuer_lock". Basically, prior |
1ce23dca | 98 | * to it being issued, it will only be accessed by the thread that's |
1b2b0aca | 99 | * holding the "zl_issuer_lock". After the lwb is issued, the zilog's |
1ce23dca | 100 | * "zl_lock" is used to protect the lwb against concurrent access. |
34dc7c2f BB |
101 | */ |
102 | typedef struct lwb { | |
103 | zilog_t *lwb_zilog; /* back pointer to log struct */ | |
104 | blkptr_t lwb_blk; /* on disk address of this log blk */ | |
eda3fcd5 | 105 | boolean_t lwb_slim; /* log block has slim format */ |
1b7c1e5c | 106 | boolean_t lwb_slog; /* lwb_blk is on SLOG device */ |
eda3fcd5 AM |
107 | int lwb_error; /* log block allocation error */ |
108 | int lwb_nmax; /* max bytes in the buffer */ | |
34dc7c2f | 109 | int lwb_nused; /* # used bytes in buffer */ |
f63811f0 | 110 | int lwb_nfilled; /* # filled bytes in buffer */ |
34dc7c2f | 111 | int lwb_sz; /* size of block and buffer */ |
1ce23dca | 112 | lwb_state_t lwb_state; /* current state; see lwb_state_t */ |
34dc7c2f | 113 | char *lwb_buf; /* log write buffer */ |
eda3fcd5 | 114 | zio_t *lwb_child_zio; /* parent zio for children */ |
1ce23dca PS |
115 | zio_t *lwb_write_zio; /* zio for the lwb buffer */ |
116 | zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ | |
f63811f0 | 117 | hrtime_t lwb_issued_timestamp; /* time at which the lwb was issued */ |
152d6fda | 118 | uint64_t lwb_issued_txg; /* the txg when the write is issued */ |
eda3fcd5 | 119 | uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */ |
34dc7c2f | 120 | uint64_t lwb_max_txg; /* highest txg in this lwb */ |
34dc7c2f | 121 | list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ |
f63811f0 | 122 | list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ |
1ce23dca PS |
123 | list_t lwb_itxs; /* list of itxs assigned to this lwb */ |
124 | list_t lwb_waiters; /* list of zil_commit_waiter_t (via zcw_node) */ | |
125 | avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ | |
126 | kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ | |
34dc7c2f BB |
127 | } lwb_t; |
128 | ||
1ce23dca PS |
129 | /* |
130 | * ZIL commit waiter. | |
131 | * | |
132 | * This structure is allocated each time zil_commit() is called, and is | |
133 | * used by zil_commit() to communicate with other parts of the ZIL, such | |
134 | * that zil_commit() can know when it is safe for it to return. For more | |
135 | * details, see the comment above zil_commit(). | |
136 | * | |
137 | * The "zcw_lock" field is used to protect the commit waiter against | |
138 | * concurrent access. This lock is often acquired while already holding | |
1b2b0aca | 139 | * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions |
1ce23dca PS |
140 | * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples |
141 | * of this. Thus, one must be careful not to acquire the | |
1b2b0aca | 142 | * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock"; |
1ce23dca PS |
143 | * e.g. see the zil_commit_waiter_timeout() function. |
144 | */ | |
145 | typedef struct zil_commit_waiter { | |
146 | kcondvar_t zcw_cv; /* signalled when "done" */ | |
147 | kmutex_t zcw_lock; /* protects fields of this struct */ | |
148 | list_node_t zcw_node; /* linkage in lwb_t:lwb_waiters list */ | |
149 | lwb_t *zcw_lwb; /* back pointer to lwb when linked */ | |
150 | boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */ | |
151 | int zcw_zio_error; /* contains the zio io_error value */ | |
152 | } zil_commit_waiter_t; | |
153 | ||
572e2857 BB |
154 | /* |
155 | * Intent log transaction lists | |
156 | */ | |
157 | typedef struct itxs { | |
158 | list_t i_sync_list; /* list of synchronous itxs */ | |
159 | avl_tree_t i_async_tree; /* tree of foids for async itxs (see itx_async_node_t) */ | |
160 | } itxs_t; | |
161 | ||
162 | typedef struct itxg { | |
163 | kmutex_t itxg_lock; /* protects the fields of this itxg_t */ | |
164 | uint64_t itxg_txg; /* txg for this chain */ | |
572e2857 BB |
165 | itxs_t *itxg_itxs; /* sync and async itxs (see itxs_t) */ |
166 | } itxg_t; | |
167 | ||
168 | /* for async nodes we build up an AVL tree of lists of async itxs per file */ | |
169 | typedef struct itx_async_node { | |
170 | uint64_t ia_foid; /* file object id */ | |
171 | list_t ia_list; /* list of async itxs for this foid */ | |
172 | avl_node_t ia_node; /* AVL tree linkage (presumably itxs_t i_async_tree; verify) */ | |
173 | } itx_async_node_t; | |
174 | ||
34dc7c2f BB |
175 | /* |
176 | * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs | |
177 | * we've touched so we know which ones need a write cache flush at the end. | |
178 | */ | |
179 | typedef struct zil_vdev_node { | |
180 | uint64_t zv_vdev; /* vdev to be flushed */ | |
181 | avl_node_t zv_node; /* AVL tree linkage (presumably lwb_t lwb_vdev_tree; verify) */ | |
182 | } zil_vdev_node_t; | |
183 | ||
252f46be | 184 | #define ZIL_BURSTS 8 /* length of zl_prev_opt[] and zl_prev_min[] */ |
428870ff | 185 | |
34dc7c2f BB |
186 | /* |
187 | * Stable storage intent log management structure. One per dataset. | |
188 | */ | |
189 | struct zilog { | |
190 | kmutex_t zl_lock; /* protects most zilog_t fields */ | |
191 | struct dsl_pool *zl_dmu_pool; /* DSL pool */ | |
192 | spa_t *zl_spa; /* handle for read/write log */ | |
193 | const zil_header_t *zl_header; /* log header buffer */ | |
194 | objset_t *zl_os; /* object set we're logging */ | |
195 | zil_get_data_t *zl_get_data; /* callback to get object content */ | |
1ce23dca PS |
196 | lwb_t *zl_last_lwb_opened; /* most recent lwb opened */ |
197 | hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */ | |
428870ff | 198 | uint64_t zl_lr_seq; /* on-disk log record sequence number */ |
428870ff | 199 | uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ |
34dc7c2f | 200 | uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ |
fb5f0bc8 BB |
201 | uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ |
202 | uint64_t zl_replaying_seq; /* current replay seq number */ | |
34dc7c2f | 203 | uint32_t zl_suspend; /* log suspend count */ |
34dc7c2f BB |
204 | kcondvar_t zl_cv_suspend; /* log suspend completion */ |
205 | uint8_t zl_suspending; /* log is currently suspending */ | |
206 | uint8_t zl_keep_first; /* keep first log block in destroy */ | |
fb5f0bc8 | 207 | uint8_t zl_replay; /* replaying records while set */ |
34dc7c2f | 208 | uint8_t zl_stop_sync; /* for debugging */ |
1b2b0aca | 209 | kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */ |
428870ff BB |
210 | uint8_t zl_logbias; /* latency or throughput */ |
211 | uint8_t zl_sync; /* synchronous or asynchronous */ | |
212 | int zl_parse_error; /* last zil_parse() error */ | |
213 | uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ | |
214 | uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ | |
215 | uint64_t zl_parse_blk_count; /* number of blocks parsed */ | |
216 | uint64_t zl_parse_lr_count; /* number of log records parsed */ | |
572e2857 BB |
217 | itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ |
218 | list_t zl_itx_commit_list; /* itx list to be committed */ | |
eff77a80 AM |
219 | uint64_t zl_cur_size; /* current burst full size */ |
220 | uint64_t zl_cur_left; /* current burst remaining size */ | |
221 | uint64_t zl_cur_max; /* biggest record in current burst */ | |
34dc7c2f | 222 | list_t zl_lwb_list; /* in-flight log write list (lwb_t, via lwb_node) */ |
428870ff | 223 | avl_tree_t zl_bp_tree; /* track bps during log parse */ |
34dc7c2f BB |
224 | clock_t zl_replay_time; /* lbolt of when replay started */ |
225 | uint64_t zl_replay_blks; /* number of log blocks replayed */ | |
428870ff | 226 | zil_header_t zl_old_header; /* debugging aid */ |
252f46be | 227 | uint_t zl_parallel; /* workload is multi-threaded */ |
428870ff | 228 | uint_t zl_prev_rotor; /* rotor for zl_prev_opt[]/zl_prev_min[] */ |
eff77a80 AM |
229 | uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ |
230 | uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ | |
29809a6c | 231 | txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ |
1ce23dca | 232 | uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ |
152d6fda KJ |
233 |
234 | kmutex_t zl_lwb_io_lock; /* protects the following members */ | |
235 | uint64_t zl_lwb_inflight[TXG_SIZE]; /* io issued, but not done */ | |
236 | kcondvar_t zl_lwb_io_cv; /* signal when the flush is done */ | |
237 | uint64_t zl_lwb_max_issued_txg; /* max txg when lwb io issued */ | |
238 | ||
b8738257 MA |
239 | /* |
240 | * Max block size for this ZIL. Note that this cannot be changed | |
241 | * while the ZIL is in use because consumers (ZPL/zvol) need to take | |
242 | * this into account when deciding between WR_COPIED and WR_NEED_COPY | |
243 | * (see zil_max_copied_data()). | |
244 | */ | |
245 | uint64_t zl_max_block_size; | |
fb087146 AH |
246 |
247 | /* Pointer for per dataset zil sums */ | |
248 | zil_sums_t *zl_sums; | |
34dc7c2f BB |
249 | }; |
250 | ||
428870ff | 251 | typedef struct zil_bp_node { |
34dc7c2f BB |
252 | dva_t zn_dva; /* DVA of a tracked block pointer (presumably the AVL key; verify) */ |
253 | avl_node_t zn_node; /* AVL tree linkage (presumably zilog zl_bp_tree; verify) */ | |
428870ff | 254 | } zil_bp_node_t; |
34dc7c2f BB |
255 | |
256 | #ifdef __cplusplus | |
257 | } | |
258 | #endif | |
259 | ||
260 | #endif /* _SYS_ZIL_IMPL_H */ |