]>
Commit | Line | Data |
---|---|---|
ff03aa2d SI |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Tom Caputi <tcaputi@datto.com> | |
3 | Date: Tue, 10 Apr 2018 14:15:05 -0400 | |
4 | Subject: [PATCH] Fix race in dnode_check_slots_free() | |
5 | ||
6 | Currently, dnode_check_slots_free() works by checking dn->dn_type | |
7 | in the dnode to determine if the dnode is reclaimable. However, | |
8 | there is a small window of time between dnode_free_sync() in the | |
9 | first call to dsl_dataset_sync() and when the useraccounting code | |
10 | is run when the type is set to DMU_OT_NONE, but the dnode is not yet | |
11 | evictable, leading to crashes. This patch adds the ability for | |
12 | dnodes to track which txg they were last dirtied in and adds a | |
13 | check for this before performing the reclaim. | |
14 | ||
15 | This patch also corrects several instances when dn_dirty_link was | |
16 | treated as a list_node_t when it is technically a multilist_node_t. | |
17 | ||
18 | Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> | |
19 | Signed-off-by: Tom Caputi <tcaputi@datto.com> | |
20 | Requires-spl: spl-0.7-release | |
21 | Issue #7147 | |
22 | Issue #7388 | |
23 | Issue #7997 | |
24 | ||
25 | (cherry-picked from behlendorf/issue-7997 4764f6f3be90be073d2700653dff286371e52583) | |
26 | Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> | |
27 | --- | |
28 | include/sys/dmu_impl.h | 1 + | |
29 | include/sys/dnode.h | 4 ++++ | |
30 | module/zfs/dbuf.c | 3 +++ | |
31 | module/zfs/dmu.c | 2 +- | |
32 | module/zfs/dmu_objset.c | 15 +++++++++++++++ | |
33 | module/zfs/dnode.c | 29 +++++++++++++++++++---------- | |
34 | 6 files changed, 43 insertions(+), 11 deletions(-) | |
35 | ||
36 | diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h | |
37 | index 65e417e3..03a63077 100644 | |
38 | --- a/include/sys/dmu_impl.h | |
39 | +++ b/include/sys/dmu_impl.h | |
40 | @@ -161,6 +161,7 @@ extern "C" { | |
41 | * dn_allocated_txg | |
42 | * dn_free_txg | |
43 | * dn_assigned_txg | |
44 | + * dn_dirty_txg | |
45 | * dd_assigned_tx | |
46 | * dn_notxholds | |
47 | * dn_dirtyctx | |
48 | diff --git a/include/sys/dnode.h b/include/sys/dnode.h | |
49 | index ea7defe1..2dd087b3 100644 | |
50 | --- a/include/sys/dnode.h | |
51 | +++ b/include/sys/dnode.h | |
52 | @@ -260,6 +260,7 @@ struct dnode { | |
53 | uint64_t dn_allocated_txg; | |
54 | uint64_t dn_free_txg; | |
55 | uint64_t dn_assigned_txg; | |
56 | + uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ | |
57 | kcondvar_t dn_notxholds; | |
58 | enum dnode_dirtycontext dn_dirtyctx; | |
59 | uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ | |
60 | @@ -362,6 +363,9 @@ void dnode_evict_dbufs(dnode_t *dn); | |
61 | void dnode_evict_bonus(dnode_t *dn); | |
62 | void dnode_free_interior_slots(dnode_t *dn); | |
63 | ||
64 | +#define DNODE_IS_DIRTY(_dn) \ | |
65 | + ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) | |
66 | + | |
67 | #define DNODE_IS_CACHEABLE(_dn) \ | |
68 | ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ | |
69 | (DMU_OT_IS_METADATA((_dn)->dn_type) && \ | |
70 | diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c | |
71 | index 4ee121f5..6edb39d6 100644 | |
72 | --- a/module/zfs/dbuf.c | |
73 | +++ b/module/zfs/dbuf.c | |
74 | @@ -1606,6 +1606,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) | |
75 | FTAG); | |
76 | } | |
77 | } | |
78 | + | |
79 | + if (tx->tx_txg > dn->dn_dirty_txg) | |
80 | + dn->dn_dirty_txg = tx->tx_txg; | |
81 | mutex_exit(&dn->dn_mtx); | |
82 | ||
83 | if (db->db_blkid == DMU_SPILL_BLKID) | |
84 | diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c | |
85 | index 6f09aa2f..a09ac4f9 100644 | |
86 | --- a/module/zfs/dmu.c | |
87 | +++ b/module/zfs/dmu.c | |
88 | @@ -2044,7 +2044,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) | |
89 | * Check if dnode is dirty | |
90 | */ | |
91 | for (i = 0; i < TXG_SIZE; i++) { | |
92 | - if (list_link_active(&dn->dn_dirty_link[i])) { | |
93 | + if (multilist_link_active(&dn->dn_dirty_link[i])) { | |
94 | clean = B_FALSE; | |
95 | break; | |
96 | } | |
97 | diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c | |
98 | index 449ebedf..0bed2d3e 100644 | |
99 | --- a/module/zfs/dmu_objset.c | |
100 | +++ b/module/zfs/dmu_objset.c | |
101 | @@ -1213,10 +1213,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) | |
102 | ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); | |
103 | multilist_sublist_remove(list, dn); | |
104 | ||
105 | + /* | |
106 | + * If we are not doing useraccounting (os_synced_dnodes == NULL) | |
107 | + * we are done with this dnode for this txg. Unset dn_dirty_txg | |
108 | + * if later txgs aren't dirtying it so that future holders do | |
109 | + * not get a stale value. Otherwise, we will do this in | |
110 | + * userquota_updates_task() when processing has completely | |
111 | + * finished for this txg. | |
112 | + */ | |
113 | multilist_t *newlist = dn->dn_objset->os_synced_dnodes; | |
114 | if (newlist != NULL) { | |
115 | (void) dnode_add_ref(dn, newlist); | |
116 | multilist_insert(newlist, dn); | |
117 | + } else { | |
118 | + mutex_enter(&dn->dn_mtx); | |
119 | + if (dn->dn_dirty_txg == tx->tx_txg) | |
120 | + dn->dn_dirty_txg = 0; | |
121 | + mutex_exit(&dn->dn_mtx); | |
122 | } | |
123 | ||
124 | dnode_sync(dn, tx); | |
125 | @@ -1621,6 +1634,8 @@ userquota_updates_task(void *arg) | |
126 | dn->dn_id_flags |= DN_ID_CHKED_BONUS; | |
127 | } | |
128 | dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); | |
129 | + if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) | |
130 | + dn->dn_dirty_txg = 0; | |
131 | mutex_exit(&dn->dn_mtx); | |
132 | ||
133 | multilist_sublist_remove(list, dn); | |
134 | diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c | |
135 | index d465b545..4a169c49 100644 | |
136 | --- a/module/zfs/dnode.c | |
137 | +++ b/module/zfs/dnode.c | |
138 | @@ -137,7 +137,7 @@ dnode_cons(void *arg, void *unused, int kmflag) | |
139 | bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); | |
140 | ||
141 | for (i = 0; i < TXG_SIZE; i++) { | |
142 | - list_link_init(&dn->dn_dirty_link[i]); | |
143 | + multilist_link_init(&dn->dn_dirty_link[i]); | |
144 | dn->dn_free_ranges[i] = NULL; | |
145 | list_create(&dn->dn_dirty_records[i], | |
146 | sizeof (dbuf_dirty_record_t), | |
147 | @@ -147,6 +147,7 @@ dnode_cons(void *arg, void *unused, int kmflag) | |
148 | dn->dn_allocated_txg = 0; | |
149 | dn->dn_free_txg = 0; | |
150 | dn->dn_assigned_txg = 0; | |
151 | + dn->dn_dirty_txg = 0; | |
152 | dn->dn_dirtyctx = 0; | |
153 | dn->dn_dirtyctx_firstset = NULL; | |
154 | dn->dn_bonus = NULL; | |
155 | @@ -184,7 +185,7 @@ dnode_dest(void *arg, void *unused) | |
156 | ASSERT(!list_link_active(&dn->dn_link)); | |
157 | ||
158 | for (i = 0; i < TXG_SIZE; i++) { | |
159 | - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); | |
160 | + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); | |
161 | ASSERT3P(dn->dn_free_ranges[i], ==, NULL); | |
162 | list_destroy(&dn->dn_dirty_records[i]); | |
163 | ASSERT0(dn->dn_next_nblkptr[i]); | |
164 | @@ -199,6 +200,7 @@ dnode_dest(void *arg, void *unused) | |
165 | ASSERT0(dn->dn_allocated_txg); | |
166 | ASSERT0(dn->dn_free_txg); | |
167 | ASSERT0(dn->dn_assigned_txg); | |
168 | + ASSERT0(dn->dn_dirty_txg); | |
169 | ASSERT0(dn->dn_dirtyctx); | |
170 | ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); | |
171 | ASSERT3P(dn->dn_bonus, ==, NULL); | |
172 | @@ -523,6 +525,7 @@ dnode_destroy(dnode_t *dn) | |
173 | dn->dn_allocated_txg = 0; | |
174 | dn->dn_free_txg = 0; | |
175 | dn->dn_assigned_txg = 0; | |
176 | + dn->dn_dirty_txg = 0; | |
177 | ||
178 | dn->dn_dirtyctx = 0; | |
179 | if (dn->dn_dirtyctx_firstset != NULL) { | |
180 | @@ -592,6 +595,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, | |
181 | ASSERT0(dn->dn_maxblkid); | |
182 | ASSERT0(dn->dn_allocated_txg); | |
183 | ASSERT0(dn->dn_assigned_txg); | |
184 | + ASSERT0(dn->dn_dirty_txg); | |
185 | ASSERT(refcount_is_zero(&dn->dn_tx_holds)); | |
186 | ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); | |
187 | ASSERT(avl_is_empty(&dn->dn_dbufs)); | |
188 | @@ -604,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, | |
189 | ASSERT0(dn->dn_next_bonustype[i]); | |
190 | ASSERT0(dn->dn_rm_spillblk[i]); | |
191 | ASSERT0(dn->dn_next_blksz[i]); | |
192 | - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); | |
193 | + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); | |
194 | ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); | |
195 | ASSERT3P(dn->dn_free_ranges[i], ==, NULL); | |
196 | } | |
197 | @@ -779,6 +783,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) | |
198 | ndn->dn_allocated_txg = odn->dn_allocated_txg; | |
199 | ndn->dn_free_txg = odn->dn_free_txg; | |
200 | ndn->dn_assigned_txg = odn->dn_assigned_txg; | |
201 | + ndn->dn_dirty_txg = odn->dn_dirty_txg; | |
202 | ndn->dn_dirtyctx = odn->dn_dirtyctx; | |
203 | ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; | |
204 | ASSERT(refcount_count(&odn->dn_tx_holds) == 0); | |
205 | @@ -845,6 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) | |
206 | odn->dn_allocated_txg = 0; | |
207 | odn->dn_free_txg = 0; | |
208 | odn->dn_assigned_txg = 0; | |
209 | + odn->dn_dirty_txg = 0; | |
210 | odn->dn_dirtyctx = 0; | |
211 | odn->dn_dirtyctx_firstset = NULL; | |
212 | odn->dn_have_spill = B_FALSE; | |
213 | @@ -1069,6 +1075,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) | |
214 | { | |
215 | ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); | |
216 | ||
217 | + /* | |
218 | + * If all dnode slots are either already free or | |
219 | + * evictable return B_TRUE. | |
220 | + */ | |
221 | for (int i = idx; i < idx + slots; i++) { | |
222 | dnode_handle_t *dnh = &children->dnc_children[i]; | |
223 | dnode_t *dn = dnh->dnh_dnode; | |
224 | @@ -1077,18 +1087,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots) | |
225 | continue; | |
226 | } else if (DN_SLOT_IS_PTR(dn)) { | |
227 | mutex_enter(&dn->dn_mtx); | |
228 | - dmu_object_type_t type = dn->dn_type; | |
229 | + boolean_t can_free = (dn->dn_type == DMU_OT_NONE && | |
230 | + !DNODE_IS_DIRTY(dn)); | |
231 | mutex_exit(&dn->dn_mtx); | |
232 | ||
233 | - if (type != DMU_OT_NONE) | |
234 | + if (!can_free) | |
235 | return (B_FALSE); | |
236 | - | |
237 | - continue; | |
238 | + else | |
239 | + continue; | |
240 | } else { | |
241 | return (B_FALSE); | |
242 | } | |
243 | - | |
244 | - return (B_FALSE); | |
245 | } | |
246 | ||
247 | return (B_TRUE); | |
248 | @@ -1594,7 +1603,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) | |
249 | /* | |
250 | * If we are already marked dirty, we're done. | |
251 | */ | |
252 | - if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { | |
253 | + if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { | |
254 | multilist_sublist_unlock(mls); | |
255 | return; | |
256 | } |