From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Tom Caputi <tcaputi@datto.com>
Date: Tue, 10 Apr 2018 14:15:05 -0400
Subject: [PATCH] Fix race in dnode_check_slots_free()

Currently, dnode_check_slots_free() works by checking dn->dn_type
in the dnode to determine if the dnode is reclaimable. However,
there is a small window of time between dnode_free_sync() in the
first call to dsl_dataset_sync() and when the useraccounting code
is run, during which the type is set to DMU_OT_NONE but the dnode
is not yet evictable, leading to crashes. This patch adds the
ability for dnodes to track which txg they were last dirtied in
and adds a check for this before performing the reclaim.

This patch also corrects several instances where dn_dirty_link was
treated as a list_node_t when it is technically a multilist_node_t.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #7147
Closes #7388
---
 include/sys/dmu_impl.h  |  1 +
 include/sys/dnode.h     |  4 ++++
 module/zfs/dbuf.c       |  3 +++
 module/zfs/dmu.c        |  2 +-
 module/zfs/dmu_objset.c | 15 +++++++++++++++
 module/zfs/dnode.c      | 29 +++++++++++++++++++----------
 6 files changed, 43 insertions(+), 11 deletions(-)

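Note (editorial, not part of the applied diff): the heart of the change,
condensed into a self-contained C sketch. The helper name
dnode_slot_reclaimable() and the pared-down struct are invented here for
illustration; dn_type, dn_dirty_txg, DMU_OT_NONE and spa_syncing_txg()
are the real symbols the patch uses, and kmutex_t/boolean_t come from
the SPL headers. A slot may only be reclaimed when its type is
DMU_OT_NONE *and* no txg at or after the currently syncing txg has
dirtied it:

/* Illustration only: simplified stand-ins for the real ZFS structures. */
typedef struct dnode_sketch {
    kmutex_t dn_mtx;                /* protects the fields below */
    dmu_object_type_t dn_type;      /* DMU_OT_NONE once freed in-core */
    uint64_t dn_dirty_txg;          /* txg this dnode was last dirtied in */
} dnode_sketch_t;

/*
 * Pre-patch, only dn_type was checked. DMU_OT_NONE is already set in
 * the window between dnode_free_sync() and the useraccounting pass,
 * while the dnode is not yet evictable -- the race being fixed here.
 */
static boolean_t
dnode_slot_reclaimable(dnode_sketch_t *dn, uint64_t syncing_txg)
{
    mutex_enter(&dn->dn_mtx);
    boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
        dn->dn_dirty_txg < syncing_txg);    /* i.e. !DNODE_IS_DIRTY() */
    mutex_exit(&dn->dn_mtx);

    return (can_free);
}
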
diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h
index 65e417e3..03a63077 100644
--- a/include/sys/dmu_impl.h
+++ b/include/sys/dmu_impl.h
@@ -161,6 +161,7 @@ extern "C" {
  *    dn_allocated_txg
  *    dn_free_txg
  *    dn_assigned_txg
+ *    dn_dirty_txg
  *    dd_assigned_tx
  *    dn_notxholds
  *    dn_dirtyctx
diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index ea7defe1..2dd087b3 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -260,6 +260,7 @@ struct dnode {
     uint64_t dn_allocated_txg;
     uint64_t dn_free_txg;
     uint64_t dn_assigned_txg;
+    uint64_t dn_dirty_txg;    /* txg dnode was last dirtied */
     kcondvar_t dn_notxholds;
     enum dnode_dirtycontext dn_dirtyctx;
     uint8_t *dn_dirtyctx_firstset;    /* dbg: contents meaningless */
@@ -362,6 +363,9 @@ void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);
 void dnode_free_interior_slots(dnode_t *dn);
 
+#define DNODE_IS_DIRTY(_dn) \
+    ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))
+
 #define DNODE_IS_CACHEABLE(_dn) \
     ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
     (DMU_OT_IS_METADATA((_dn)->dn_type) && \
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 4ee121f5..6edb39d6 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1606,6 +1606,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
             FTAG);
         }
     }
+
+    if (tx->tx_txg > dn->dn_dirty_txg)
+        dn->dn_dirty_txg = tx->tx_txg;
     mutex_exit(&dn->dn_mtx);
 
     if (db->db_blkid == DMU_SPILL_BLKID)
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 6f09aa2f..a09ac4f9 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2044,7 +2044,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
      * Check if dnode is dirty
      */
     for (i = 0; i < TXG_SIZE; i++) {
-        if (list_link_active(&dn->dn_dirty_link[i])) {
+        if (multilist_link_active(&dn->dn_dirty_link[i])) {
             clean = B_FALSE;
             break;
         }
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 449ebedf..0bed2d3e 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1213,10 +1213,23 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
         ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
         multilist_sublist_remove(list, dn);
 
+        /*
+         * If we are not doing useraccounting (os_synced_dnodes == NULL)
+         * we are done with this dnode for this txg. Unset dn_dirty_txg
+         * if later txgs aren't dirtying it so that future holders do
+         * not get a stale value. Otherwise, we will do this in
+         * userquota_updates_task() when processing has completely
+         * finished for this txg.
+         */
         multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
         if (newlist != NULL) {
             (void) dnode_add_ref(dn, newlist);
             multilist_insert(newlist, dn);
+        } else {
+            mutex_enter(&dn->dn_mtx);
+            if (dn->dn_dirty_txg == tx->tx_txg)
+                dn->dn_dirty_txg = 0;
+            mutex_exit(&dn->dn_mtx);
         }
 
         dnode_sync(dn, tx);
@@ -1621,6 +1634,8 @@ userquota_updates_task(void *arg)
             dn->dn_id_flags |= DN_ID_CHKED_BONUS;
         }
         dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+        if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa))
+            dn->dn_dirty_txg = 0;
         mutex_exit(&dn->dn_mtx);
 
         multilist_sublist_remove(list, dn);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index d465b545..4a169c49 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -137,7 +137,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
     bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
 
     for (i = 0; i < TXG_SIZE; i++) {
-        list_link_init(&dn->dn_dirty_link[i]);
+        multilist_link_init(&dn->dn_dirty_link[i]);
         dn->dn_free_ranges[i] = NULL;
         list_create(&dn->dn_dirty_records[i],
             sizeof (dbuf_dirty_record_t),
@@ -147,6 +147,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
     dn->dn_allocated_txg = 0;
     dn->dn_free_txg = 0;
     dn->dn_assigned_txg = 0;
+    dn->dn_dirty_txg = 0;
     dn->dn_dirtyctx = 0;
     dn->dn_dirtyctx_firstset = NULL;
     dn->dn_bonus = NULL;
@@ -184,7 +185,7 @@ dnode_dest(void *arg, void *unused)
     ASSERT(!list_link_active(&dn->dn_link));
 
     for (i = 0; i < TXG_SIZE; i++) {
-        ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+        ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
         ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
         list_destroy(&dn->dn_dirty_records[i]);
         ASSERT0(dn->dn_next_nblkptr[i]);
@@ -199,6 +200,7 @@ dnode_dest(void *arg, void *unused)
     ASSERT0(dn->dn_allocated_txg);
     ASSERT0(dn->dn_free_txg);
     ASSERT0(dn->dn_assigned_txg);
+    ASSERT0(dn->dn_dirty_txg);
     ASSERT0(dn->dn_dirtyctx);
     ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
     ASSERT3P(dn->dn_bonus, ==, NULL);
@@ -523,6 +525,7 @@ dnode_destroy(dnode_t *dn)
     dn->dn_allocated_txg = 0;
     dn->dn_free_txg = 0;
     dn->dn_assigned_txg = 0;
+    dn->dn_dirty_txg = 0;
 
     dn->dn_dirtyctx = 0;
     if (dn->dn_dirtyctx_firstset != NULL) {
@@ -592,6 +595,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     ASSERT0(dn->dn_maxblkid);
     ASSERT0(dn->dn_allocated_txg);
     ASSERT0(dn->dn_assigned_txg);
+    ASSERT0(dn->dn_dirty_txg);
     ASSERT(refcount_is_zero(&dn->dn_tx_holds));
     ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
     ASSERT(avl_is_empty(&dn->dn_dbufs));
@@ -604,7 +608,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
         ASSERT0(dn->dn_next_bonustype[i]);
         ASSERT0(dn->dn_rm_spillblk[i]);
         ASSERT0(dn->dn_next_blksz[i]);
-        ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
+        ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
         ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
         ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
     }
@@ -779,6 +783,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
     ndn->dn_allocated_txg = odn->dn_allocated_txg;
     ndn->dn_free_txg = odn->dn_free_txg;
     ndn->dn_assigned_txg = odn->dn_assigned_txg;
+    ndn->dn_dirty_txg = odn->dn_dirty_txg;
     ndn->dn_dirtyctx = odn->dn_dirtyctx;
     ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
     ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
@@ -845,6 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
     odn->dn_allocated_txg = 0;
     odn->dn_free_txg = 0;
     odn->dn_assigned_txg = 0;
+    odn->dn_dirty_txg = 0;
     odn->dn_dirtyctx = 0;
     odn->dn_dirtyctx_firstset = NULL;
     odn->dn_have_spill = B_FALSE;
@@ -1069,6 +1075,10 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
 {
     ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
+    /*
+     * If all dnode slots are either already free or
+     * evictable return B_TRUE.
+     */
     for (int i = idx; i < idx + slots; i++) {
         dnode_handle_t *dnh = &children->dnc_children[i];
         dnode_t *dn = dnh->dnh_dnode;
@@ -1077,18 +1087,17 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
             continue;
         } else if (DN_SLOT_IS_PTR(dn)) {
             mutex_enter(&dn->dn_mtx);
-            dmu_object_type_t type = dn->dn_type;
+            boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+                !DNODE_IS_DIRTY(dn));
             mutex_exit(&dn->dn_mtx);
 
-            if (type != DMU_OT_NONE)
+            if (!can_free)
                 return (B_FALSE);
-
-            continue;
+            else
+                continue;
         } else {
             return (B_FALSE);
         }
-
-        return (B_FALSE);
     }
 
     return (B_TRUE);
@@ -1594,7 +1603,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
     /*
     * If we are already marked dirty, we're done.
     */
-    if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+    if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
         multilist_sublist_unlock(mls);
         return;
     }
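
Note (editorial, not part of the applied diff): taken together, the hunks
above give dn_dirty_txg a small lifecycle. The C fragments below condense
that flow for reference; "syncing_txg" is a stand-in for tx->tx_txg or
spa_syncing_txg(os->os_spa) at the respective call sites.

/*
 * 1) Dirtying (dbuf_dirty, under dn_mtx): remember the newest txg
 *    that dirtied this dnode.
 */
if (tx->tx_txg > dn->dn_dirty_txg)
    dn->dn_dirty_txg = tx->tx_txg;

/*
 * 2) Sync completion for that txg (dmu_objset_sync_dnodes when user
 *    accounting is off, otherwise userquota_updates_task): clear the
 *    field so future holders do not see a stale value, but only if
 *    no newer txg has re-dirtied the dnode in the meantime.
 */
mutex_enter(&dn->dn_mtx);
if (dn->dn_dirty_txg == syncing_txg)
    dn->dn_dirty_txg = 0;
mutex_exit(&dn->dn_mtx);

/*
 * 3) Reclaim check (dnode_check_slots_free): a slot counts as free
 *    only when dn_type == DMU_OT_NONE and !DNODE_IS_DIRTY(dn), i.e.
 *    dn_dirty_txg < spa_syncing_txg(spa).
 */

The >= comparison in DNODE_IS_DIRTY() is deliberately conservative: a
dnode dirtied in the txg that is currently syncing has not yet finished
its pass through the sync path, so it remains dirty until step 2 clears
the field.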