]> git.proxmox.com Git - zfsonlinux.git/blob - zfs-patches/0005-Fix-race-in-dnode_check_slots_free.patch
update/rebase to spl-0.7.12 with patches from ZOL
[zfsonlinux.git] / zfs-patches / 0005-Fix-race-in-dnode_check_slots_free.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Tom Caputi <tcaputi@datto.com>
3 Date: Tue, 10 Apr 2018 14:15:05 -0400
4 Subject: [PATCH] Fix race in dnode_check_slots_free()
5
6 Currently, dnode_check_slots_free() works by checking dn->dn_type
7 in the dnode to determine if the dnode is reclaimable. However,
8 there is a small window of time between dnode_free_sync() in the
9 first call to dsl_dataset_sync() and when the useraccounting code
10 is run when the type is set to DMU_OT_NONE, but the dnode is not yet
11 evictable, leading to crashes. This patch adds the ability for
12 dnodes to track which txg they were last dirtied in and adds a
13 check for this before performing the reclaim.
14
15 This patch also corrects several instances when dn_dirty_link was
16 treated as a list_node_t when it is technically a multilist_node_t.
17
18 Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
19 Signed-off-by: Tom Caputi <tcaputi@datto.com>
20 Requires-spl: spl-0.7-release
21 Issue #7147
22 Issue #7388
23 Issue #7997
24
25 (cherry-picked from behlendorf/issue-7997 4764f6f3be90be073d2700653dff286371e52583)
26 Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
27 ---
28 include/sys/dmu_impl.h | 1 +
29 include/sys/dnode.h | 4 ++++
30 module/zfs/dbuf.c | 3 +++
31 module/zfs/dmu.c | 2 +-
32 module/zfs/dmu_objset.c | 15 +++++++++++++++
33 module/zfs/dnode.c | 29 +++++++++++++++++++----------
34 6 files changed, 43 insertions(+), 11 deletions(-)
35
36 diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h
37 index 65e417e3..03a63077 100644
38 --- a/include/sys/dmu_impl.h
39 +++ b/include/sys/dmu_impl.h
40 @@ -161,6 +161,7 @@ extern "C" {
41 * dn_allocated_txg
42 * dn_free_txg
43 * dn_assigned_txg
44 + * dn_dirty_txg
45 * dd_assigned_tx
46 * dn_notxholds
47 * dn_dirtyctx
48 diff --git a/include/sys/dnode.h b/include/sys/dnode.h
49 index ea7defe1..2dd087b3 100644
50 --- a/include/sys/dnode.h
51 +++ b/include/sys/dnode.h
52 @@ -260,6 +260,7 @@ struct dnode {
53 uint64_t dn_allocated_txg;
54 uint64_t dn_free_txg;
55 uint64_t dn_assigned_txg;
56 + uint64_t dn_dirty_txg; /* txg dnode was last dirtied */
57 kcondvar_t dn_notxholds;
58 enum dnode_dirtycontext dn_dirtyctx;
59 uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */
60 @@ -362,6 +363,9 @@ void dnode_evict_dbufs(dnode_t *dn);
61 void dnode_evict_bonus(dnode_t *dn);
62 void dnode_free_interior_slots(dnode_t *dn);
63
64 +#define DNODE_IS_DIRTY(_dn) \
65 + ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
66 +
67 #define DNODE_IS_CACHEABLE(_dn) \
68 ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
69 (DMU_OT_IS_METADATA((_dn)->dn_type) && \
70 diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
71 index 4ee121f5..6edb39d6 100644
72 --- a/module/zfs/dbuf.c
73 +++ b/module/zfs/dbuf.c
74 @@ -1606,6 +1606,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
75 FTAG);
76 }
77 }
78 +
79 + if (tx->tx_txg > dn->dn_dirty_txg)
80 + dn->dn_dirty_txg = tx->tx_txg;
81 mutex_exit(&dn->dn_mtx);
82
83 if (db->db_blkid == DMU_SPILL_BLKID)
84 diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
85 index 6f09aa2f..a09ac4f9 100644
86 --- a/module/zfs/dmu.c
87 +++ b/module/zfs/dmu.c
88 @@ -2044,7 +2044,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
89 * Check if dnode is dirty
90 */
91 for (i = 0; i < TXG_SIZE; i++) {
92 - if (list_link_active(&dn->dn_dirty_link[i])) {
93 + if (multilist_link_active(&dn->dn_dirty_link[i])) {
94 clean = B_FALSE;
95 break;
96 }
97 diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
98 index 449ebedf..0bed2d3e 100644
99 --- a/module/zfs/dmu_objset.c
100 +++ b/module/zfs/dmu_objset.c
101 @@ -1213,10 +1213,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
102 ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
103 multilist_sublist_remove(list, dn);
104
105 + /*
106 + * If we are not doing useraccounting (os_synced_dnodes == NULL)
107 + * we are done with this dnode for this txg. Unset dn_dirty_txg
108 + * if later txgs aren't dirtying it so that future holders do
109 + * not get a stale value. Otherwise, we will do this in
110 + * userquota_updates_task() when processing has completely
111 + * finished for this txg.
112 + */
113 multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
114 if (newlist != NULL) {
115 (void) dnode_add_ref(dn, newlist);
116 multilist_insert(newlist, dn);
117 + } else {
118 + mutex_enter(&dn->dn_mtx);
119 + if (dn->dn_dirty_txg == tx->tx_txg)
120 + dn->dn_dirty_txg = 0;
121 + mutex_exit(&dn->dn_mtx);
122 }
123
124 dnode_sync(dn, tx);
125 @@ -1621,6 +1634,8 @@ userquota_updates_task(void *arg)
126 dn->dn_id_flags |= DN_ID_CHKED_BONUS;
127 }
128 dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
129 + if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
130 + dn->dn_dirty_txg = 0;
131 mutex_exit(&dn->dn_mtx);
132
133 multilist_sublist_remove(list, dn);
134 diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
135 index d465b545..4a169c49 100644
136 --- a/module/zfs/dnode.c
137 +++ b/module/zfs/dnode.c
138 @@ -137,7 +137,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
139 bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
140
141 for (i = 0; i < TXG_SIZE; i++) {
142 - list_link_init(&dn->dn_dirty_link[i]);
143 + multilist_link_init(&dn->dn_dirty_link[i]);
144 dn->dn_free_ranges[i] = NULL;
145 list_create(&dn->dn_dirty_records[i],
146 sizeof (dbuf_dirty_record_t),
147 @@ -147,6 +147,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
148 dn->dn_allocated_txg = 0;
149 dn->dn_free_txg = 0;
150 dn->dn_assigned_txg = 0;
151 + dn->dn_dirty_txg = 0;
152 dn->dn_dirtyctx = 0;
153 dn->dn_dirtyctx_firstset = NULL;
154 dn->dn_bonus = NULL;
155 @@ -184,7 +185,7 @@ dnode_dest(void *arg, void *unused)
156 ASSERT(!list_link_active(&dn->dn_link));
157
158 for (i = 0; i < TXG_SIZE; i++) {
159 - ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
160 + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
161 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
162 list_destroy(&dn->dn_dirty_records[i]);
163 ASSERT0(dn->dn_next_nblkptr[i]);
164 @@ -199,6 +200,7 @@ dnode_dest(void *arg, void *unused)
165 ASSERT0(dn->dn_allocated_txg);
166 ASSERT0(dn->dn_free_txg);
167 ASSERT0(dn->dn_assigned_txg);
168 + ASSERT0(dn->dn_dirty_txg);
169 ASSERT0(dn->dn_dirtyctx);
170 ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
171 ASSERT3P(dn->dn_bonus, ==, NULL);
172 @@ -523,6 +525,7 @@ dnode_destroy(dnode_t *dn)
173 dn->dn_allocated_txg = 0;
174 dn->dn_free_txg = 0;
175 dn->dn_assigned_txg = 0;
176 + dn->dn_dirty_txg = 0;
177
178 dn->dn_dirtyctx = 0;
179 if (dn->dn_dirtyctx_firstset != NULL) {
180 @@ -592,6 +595,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
181 ASSERT0(dn->dn_maxblkid);
182 ASSERT0(dn->dn_allocated_txg);
183 ASSERT0(dn->dn_assigned_txg);
184 + ASSERT0(dn->dn_dirty_txg);
185 ASSERT(refcount_is_zero(&dn->dn_tx_holds));
186 ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
187 ASSERT(avl_is_empty(&dn->dn_dbufs));
188 @@ -604,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
189 ASSERT0(dn->dn_next_bonustype[i]);
190 ASSERT0(dn->dn_rm_spillblk[i]);
191 ASSERT0(dn->dn_next_blksz[i]);
192 - ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
193 + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
194 ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
195 ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
196 }
197 @@ -779,6 +783,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
198 ndn->dn_allocated_txg = odn->dn_allocated_txg;
199 ndn->dn_free_txg = odn->dn_free_txg;
200 ndn->dn_assigned_txg = odn->dn_assigned_txg;
201 + ndn->dn_dirty_txg = odn->dn_dirty_txg;
202 ndn->dn_dirtyctx = odn->dn_dirtyctx;
203 ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
204 ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
205 @@ -845,6 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
206 odn->dn_allocated_txg = 0;
207 odn->dn_free_txg = 0;
208 odn->dn_assigned_txg = 0;
209 + odn->dn_dirty_txg = 0;
210 odn->dn_dirtyctx = 0;
211 odn->dn_dirtyctx_firstset = NULL;
212 odn->dn_have_spill = B_FALSE;
213 @@ -1069,6 +1075,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
214 {
215 ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
216
217 + /*
218 + * If all dnode slots are either already free or
219 + * evictable return B_TRUE.
220 + */
221 for (int i = idx; i < idx + slots; i++) {
222 dnode_handle_t *dnh = &children->dnc_children[i];
223 dnode_t *dn = dnh->dnh_dnode;
224 @@ -1077,18 +1087,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
225 continue;
226 } else if (DN_SLOT_IS_PTR(dn)) {
227 mutex_enter(&dn->dn_mtx);
228 - dmu_object_type_t type = dn->dn_type;
229 + boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
230 + !DNODE_IS_DIRTY(dn));
231 mutex_exit(&dn->dn_mtx);
232
233 - if (type != DMU_OT_NONE)
234 + if (!can_free)
235 return (B_FALSE);
236 -
237 - continue;
238 + else
239 + continue;
240 } else {
241 return (B_FALSE);
242 }
243 -
244 - return (B_FALSE);
245 }
246
247 return (B_TRUE);
248 @@ -1594,7 +1603,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
249 /*
250 * If we are already marked dirty, we're done.
251 */
252 - if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
253 + if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
254 multilist_sublist_unlock(mls);
255 return;
256 }