]> git.proxmox.com Git - zfsonlinux.git/blame - zfs-patches/0028-Handle-zap_add-failures-in-mixed-case-mode.patch
bump version to 0.7.7-pve1~bpo9
[zfsonlinux.git] / zfs-patches / 0028-Handle-zap_add-failures-in-mixed-case-mode.patch
CommitLineData
75b07eca
FG
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: sanjeevbagewadi <sanjeev.bagewadi@gmail.com>
3Date: Fri, 9 Feb 2018 23:45:53 +0530
4Subject: [PATCH] Handle zap_add() failures in mixed case mode
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9With "casesensitivity=mixed", zap_add() could fail when the number of
10files/directories with the same name (varying in case) exceeds the
11capacity of the leaf node of a Fatzap. This results in an ASSERT()
12failure as zfs_link_create() does not expect zap_add() to fail. The fix
13is to handle these failures and roll back the transactions.
14
15Reviewed-by: Matt Ahrens <mahrens@delphix.com>
16Reviewed-by: Chunwei Chen <david.chen@nutanix.com>
17Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
18Signed-off-by: Sanjeev Bagewadi <sanjeev.bagewadi@gmail.com>
19Closes #7011
20Closes #7054
21(cherry picked from commit b3da003ebfad673bb4ada35f87a18a1ef175e95d)
22Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
23---
24 .../tests/functional/casenorm/Makefile.am | 1 +
25 include/sys/zap_leaf.h | 15 ++-
26 module/zfs/zap.c | 25 +++-
27 module/zfs/zap_leaf.c | 2 +-
28 module/zfs/zap_micro.c | 38 +++++-
29 module/zfs/zfs_dir.c | 29 ++++-
30 module/zfs/zfs_vnops.c | 73 ++++++++---
31 tests/runfiles/linux.run | 2 +-
32 .../functional/casenorm/mixed_create_failure.ksh | 136 +++++++++++++++++++++
33 9 files changed, 289 insertions(+), 32 deletions(-)
34 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
35
36diff --git a/tests/zfs-tests/tests/functional/casenorm/Makefile.am b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
37index 00a19c7ff..00cb59074 100644
38--- a/tests/zfs-tests/tests/functional/casenorm/Makefile.am
39+++ b/tests/zfs-tests/tests/functional/casenorm/Makefile.am
40@@ -9,6 +9,7 @@ dist_pkgdata_SCRIPTS = \
41 insensitive_formd_lookup.ksh \
42 insensitive_none_delete.ksh \
43 insensitive_none_lookup.ksh \
44+ mixed_create_failure.ksh \
45 mixed_formd_delete.ksh \
46 mixed_formd_lookup_ci.ksh \
47 mixed_formd_lookup.ksh \
48diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h
49index e784c5963..a3da1036a 100644
50--- a/include/sys/zap_leaf.h
51+++ b/include/sys/zap_leaf.h
52@@ -46,10 +46,15 @@ struct zap_stats;
53 * block size (1<<l->l_bs) - hash entry size (2) * number of hash
54 * entries - header space (2*chunksize)
55 */
56-#define ZAP_LEAF_NUMCHUNKS(l) \
57- (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \
58+#define ZAP_LEAF_NUMCHUNKS_BS(bs) \
59+ (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
60 ZAP_LEAF_CHUNKSIZE - 2)
61
62+#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
63+
64+#define ZAP_LEAF_NUMCHUNKS_DEF \
65+ (ZAP_LEAF_NUMCHUNKS_BS(fzap_default_block_shift))
66+
67 /*
68 * The amount of space within the chunk available for the array is:
69 * chunk size - space for type (1) - space for next pointer (2)
70@@ -74,8 +79,10 @@ struct zap_stats;
71 * which is less than block size / CHUNKSIZE (24) / minimum number of
72 * chunks per entry (3).
73 */
74-#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5)
75-#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l))
76+#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
77+#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
78+#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
79+#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))
80
81 /*
82 * The chunks start immediately after the hash table. The end of the
83diff --git a/module/zfs/zap.c b/module/zfs/zap.c
84index ee9962bff..9843d8c50 100644
85--- a/module/zfs/zap.c
86+++ b/module/zfs/zap.c
87@@ -819,15 +819,19 @@ fzap_lookup(zap_name_t *zn,
88 return (err);
89 }
90
91+#define MAX_EXPAND_RETRIES 2
92+
93 int
94 fzap_add_cd(zap_name_t *zn,
95 uint64_t integer_size, uint64_t num_integers,
96 const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
97 {
98 zap_leaf_t *l;
99+ zap_leaf_t *prev_l = NULL;
100 int err;
101 zap_entry_handle_t zeh;
102 zap_t *zap = zn->zn_zap;
103+ int expand_retries = 0;
104
105 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
106 ASSERT(!zap->zap_ismicro);
107@@ -851,10 +855,29 @@ retry:
108 if (err == 0) {
109 zap_increment_num_entries(zap, 1, tx);
110 } else if (err == EAGAIN) {
111+ /*
112+ * If more than MAX_EXPAND_RETRIES expansions did not help and we are
113+ * still on the same leaf, there is no point trying to expand again.
114+ */
115+ if (expand_retries > MAX_EXPAND_RETRIES && prev_l == l) {
116+ err = SET_ERROR(ENOSPC);
117+ goto out;
118+ }
119+
120 err = zap_expand_leaf(zn, l, tag, tx, &l);
121 zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
122- if (err == 0)
123+ if (err == 0) {
124+ prev_l = l;
125+ expand_retries++;
126 goto retry;
127+ } else if (err == ENOSPC) {
128+ /*
129+ * If we failed to expand the leaf, then bail out
130+ * as there is no point trying
131+ * zap_put_leaf_maybe_grow_ptrtbl().
132+ */
133+ return (err);
134+ }
135 }
136
137 out:
138diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c
139index c342695c7..526e46606 100644
140--- a/module/zfs/zap_leaf.c
141+++ b/module/zfs/zap_leaf.c
142@@ -53,7 +53,7 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
143 ((h) >> \
144 (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
145
146-#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
147+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
148
149 extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
150
151diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
152index 3ebf995c6..34bef3e63 100644
153--- a/module/zfs/zap_micro.c
154+++ b/module/zfs/zap_micro.c
155@@ -363,6 +363,41 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash)
156 return (cd);
157 }
158
159+/*
160+ * Each mzap entry requires at most 1 + 3 + 1 chunks: the leading 1 in the
161+ * macro (presumably the entry chunk), 3 for the name and 1 for the value.
162+ */
163+#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
164+ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
165+
166+/*
167+ * Check if the current entry keeps the colliding entries under the fatzap leaf
168+ * size.
169+ */
170+static boolean_t
171+mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
172+{
173+ zap_t *zap = zn->zn_zap;
174+ mzap_ent_t mze_tofind;
175+ mzap_ent_t *mze;
176+ avl_index_t idx;
177+ avl_tree_t *avl = &zap->zap_m.zap_avl;
178+ uint32_t mzap_ents = 0;
179+
180+ mze_tofind.mze_hash = hash;
181+ mze_tofind.mze_cd = 0;
182+
183+ for (mze = avl_find(avl, &mze_tofind, &idx);
184+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
185+ mzap_ents++;
186+ }
187+
188+ /* Include the new entry being added */
189+ mzap_ents++;
190+
191+ return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
192+}
193+
194 static void
195 mze_remove(zap_t *zap, mzap_ent_t *mze)
196 {
197@@ -1191,7 +1226,8 @@ zap_add_impl(zap_t *zap, const char *key,
198 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
199 zap = zn->zn_zap; /* fzap_add() may change zap */
200 } else if (integer_size != 8 || num_integers != 1 ||
201- strlen(key) >= MZAP_NAME_LEN) {
202+ strlen(key) >= MZAP_NAME_LEN ||
203+ !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
204 err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
205 if (err == 0) {
206 err = fzap_add(zn, integer_size, num_integers, val,
207diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c
208index 9a8bbccd9..6398a1d15 100644
209--- a/module/zfs/zfs_dir.c
210+++ b/module/zfs/zfs_dir.c
211@@ -742,7 +742,11 @@ zfs_dirent(znode_t *zp, uint64_t mode)
212 }
213
214 /*
215- * Link zp into dl. Can only fail if zp has been unlinked.
216+ * Link zp into dl. Can fail in the following cases :
217+ * - if zp has been unlinked.
218+ * - if the number of entries with the same hash (aka. colliding entries)
219+ * exceeds the capacity of a leaf-block of fatzap and splitting of the
220+ * leaf-block does not help.
221 */
222 int
223 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
224@@ -776,6 +780,24 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
225 NULL, &links, sizeof (links));
226 }
227 }
228+
229+ value = zfs_dirent(zp, zp->z_mode);
230+ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
231+ &value, tx);
232+
233+ /*
234+ * zap_add could fail to add the entry if it exceeds the capacity of the
235+ * leaf-block and zap_leaf_split() failed to help.
236+ * The caller of this routine is responsible for failing the transaction
237+ * which will roll back the SA updates done above.
238+ */
239+ if (error != 0) {
240+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
241+ drop_nlink(ZTOI(zp));
242+ mutex_exit(&zp->z_lock);
243+ return (error);
244+ }
245+
246 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
247 &dzp->z_id, sizeof (dzp->z_id));
248 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
249@@ -813,11 +835,6 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
250 ASSERT(error == 0);
251 mutex_exit(&dzp->z_lock);
252
253- value = zfs_dirent(zp, zp->z_mode);
254- error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
255- 8, 1, &value, tx);
256- ASSERT(error == 0);
257-
258 return (0);
259 }
260
261diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
262index 6f6ce79db..8a7ad702c 100644
263--- a/module/zfs/zfs_vnops.c
264+++ b/module/zfs/zfs_vnops.c
265@@ -1443,10 +1443,22 @@ top:
266 }
267 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
268
269+ error = zfs_link_create(dl, zp, tx, ZNEW);
270+ if (error != 0) {
271+ /*
272+ * Since we failed to add the directory entry for it,
273+ * delete the newly created dnode.
274+ */
275+ zfs_znode_delete(zp, tx);
276+ remove_inode_hash(ZTOI(zp));
277+ zfs_acl_ids_free(&acl_ids);
278+ dmu_tx_commit(tx);
279+ goto out;
280+ }
281+
282 if (fuid_dirtied)
283 zfs_fuid_sync(zfsvfs, tx);
284
285- (void) zfs_link_create(dl, zp, tx, ZNEW);
286 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
287 if (flag & FIGNORECASE)
288 txtype |= TX_CI;
289@@ -2037,13 +2049,18 @@ top:
290 */
291 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
292
293- if (fuid_dirtied)
294- zfs_fuid_sync(zfsvfs, tx);
295-
296 /*
297 * Now put new name in parent dir.
298 */
299- (void) zfs_link_create(dl, zp, tx, ZNEW);
300+ error = zfs_link_create(dl, zp, tx, ZNEW);
301+ if (error != 0) {
302+ zfs_znode_delete(zp, tx);
303+ remove_inode_hash(ZTOI(zp));
304+ goto out;
305+ }
306+
307+ if (fuid_dirtied)
308+ zfs_fuid_sync(zfsvfs, tx);
309
310 *ipp = ZTOI(zp);
311
312@@ -2053,6 +2070,7 @@ top:
313 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
314 acl_ids.z_fuidp, vap);
315
316+out:
317 zfs_acl_ids_free(&acl_ids);
318
319 dmu_tx_commit(tx);
320@@ -2062,10 +2080,14 @@ top:
321 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
322 zil_commit(zilog, 0);
323
324- zfs_inode_update(dzp);
325- zfs_inode_update(zp);
326+ if (error != 0) {
327+ iput(ZTOI(zp));
328+ } else {
329+ zfs_inode_update(dzp);
330+ zfs_inode_update(zp);
331+ }
332 ZFS_EXIT(zfsvfs);
333- return (0);
334+ return (error);
335 }
336
337 /*
338@@ -3683,6 +3705,13 @@ top:
339 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
340 ZRENAMING, NULL), ==, 0);
341 }
342+ } else {
343+ /*
344+ * If we had removed the existing target, the subsequent
345+ * call to zfs_link_create() to add back the same entry,
346+ * but with the new dnode (szp), should not fail.
347+ */
348+ ASSERT(tzp == NULL);
349 }
350 }
351
352@@ -3853,14 +3882,18 @@ top:
353 /*
354 * Insert the new object into the directory.
355 */
356- (void) zfs_link_create(dl, zp, tx, ZNEW);
357-
358- if (flags & FIGNORECASE)
359- txtype |= TX_CI;
360- zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
361+ error = zfs_link_create(dl, zp, tx, ZNEW);
362+ if (error != 0) {
363+ zfs_znode_delete(zp, tx);
364+ remove_inode_hash(ZTOI(zp));
365+ } else {
366+ if (flags & FIGNORECASE)
367+ txtype |= TX_CI;
368+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
369
370- zfs_inode_update(dzp);
371- zfs_inode_update(zp);
372+ zfs_inode_update(dzp);
373+ zfs_inode_update(zp);
374+ }
375
376 zfs_acl_ids_free(&acl_ids);
377
378@@ -3868,10 +3901,14 @@ top:
379
380 zfs_dirent_unlock(dl);
381
382- *ipp = ZTOI(zp);
383+ if (error == 0) {
384+ *ipp = ZTOI(zp);
385
386- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
387- zil_commit(zilog, 0);
388+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
389+ zil_commit(zilog, 0);
390+ } else {
391+ iput(ZTOI(zp));
392+ }
393
394 ZFS_EXIT(zfsvfs);
395 return (error);
396diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
397index ea2dbb282..8be3e1c62 100644
398--- a/tests/runfiles/linux.run
399+++ b/tests/runfiles/linux.run
400@@ -55,7 +55,7 @@ tags = ['functional', 'cachefile']
401 # 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
402 # 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
403 [tests/functional/casenorm]
404-tests = ['case_all_values', 'norm_all_values']
405+tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure']
406 tags = ['functional', 'casenorm']
407
408 [tests/functional/chattr]
409diff --git a/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
410new file mode 100755
411index 000000000..51b5bb3f6
412--- /dev/null
413+++ b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
414@@ -0,0 +1,136 @@
415+#!/bin/ksh -p
416+#
417+#
418+# This file and its contents are supplied under the terms of the
419+# Common Development and Distribution License ("CDDL"), version 1.0.
420+# You may only use this file in accordance with the terms of version
421+# 1.0 of the CDDL.
422+#
423+# A full copy of the text of the CDDL should have accompanied this
424+# source. A copy of the CDDL is also available via the Internet at
425+# http://www.illumos.org/license/CDDL.
426+#
427+#
428+# Copyright 2018 Nutanix Inc. All rights reserved.
429+#
430+
431+. $STF_SUITE/tests/functional/casenorm/casenorm.kshlib
432+
433+# DESCRIPTION:
434+# For the filesystem with casesensitivity=mixed, normalization=none,
435+# when multiple files with the same name (differing only in case) are created,
436+# the number of files is limited to what can fit in a fatzap leaf-block.
437+# And beyond that, it fails with ENOSPC.
438+#
439+# Ensure that the create/rename operations fail gracefully and do not trigger an
440+# ASSERT.
441+#
442+# STRATEGY:
443+# Repeat the below steps for objects: files, directories, symlinks and hardlinks
444+# 1. Create objects with same name but varying in case.
445+# E.g. 'abcdefghijklmnop', 'Abcdefghijklmnop', 'ABcdefghijklmnop' etc.
446+# The create should fail with ENOSPC.
447+# 2. Create an object with name 'tmp_obj' and try to rename it to name that we
448+# failed to add in step 1 above.
449+# This should fail as well.
450+
451+verify_runnable "global"
452+
453+function cleanup
454+{
455+ destroy_testfs
456+}
457+
458+log_onexit cleanup
459+log_assert "With mixed mode: ensure create fails with ENOSPC beyond a certain limit"
460+
461+create_testfs "-o casesensitivity=mixed -o normalization=none"
462+
463+# Different object types
464+obj_type=('file' 'dir' 'symlink' 'hardlink')
465+
466+# Commands to create different object types
467+typeset -A ops
468+ops['file']='touch'
469+ops['dir']='mkdir'
470+ops['symlink']='ln -s'
471+ops['hardlink']='ln'
472+
473+# This function tests the following for a given object type :
474+# - Create multiple objects with the same name (varying only in case).
475+# Ensure that it eventually fails once the leaf-block limit is exceeded.
476+# - Create another object with a different name and attempt to rename it to
477+# the name (for which the create had failed in the previous step).
478+# This should fail as well.
479+# Args :
480+# $1 - object type (file/dir/symlink/hardlink)
481+# $2 - test directory
482+#
483+function test_ops
484+{
485+ typeset obj_type=$1
486+ typeset testdir=$2
487+
488+ target_obj='target-file'
489+
490+ op="${ops[$obj_type]}"
491+
492+ log_note "The op : $op"
493+ log_note "testdir=$testdir obj_type=$obj_type"
494+
495+ test_path="$testdir/$obj_type"
496+ mkdir $test_path
497+ log_note "Created test dir $test_path"
498+
499+ if [[ $obj_type = "symlink" || $obj_type = "hardlink" ]]; then
500+ touch $test_path/$target_obj
501+ log_note "Created target: $test_path/$target_obj"
502+ op="$op $test_path/$target_obj"
503+ fi
504+
505+ log_note "op : $op"
506+ names='{a,A}{b,B}{c,C}{d,D}{e,E}{f,F}{g,G}{h,H}{i,I}{j,J}{k,K}{l,L}'
507+ for name in $names; do
508+ cmd="$op $test_path/$name"
509+ out=$($cmd 2>&1)
510+ ret=$?
511+ log_note "cmd: $cmd ret: $ret out=$out"
512+ if (($ret != 0)); then
513+ if [[ $out = *@(No space left on device)* ]]; then
514+ save_name="$test_path/$name"
515+ break;
516+ else
517+ log_err "$cmd failed with unexpected error : $out"
518+ fi
519+ fi
520+ done
521+
522+ log_note 'Test rename \"sample_name\" rename'
523+ TMP_OBJ="$test_path/tmp_obj"
524+ cmd="$op $TMP_OBJ"
525+ out=$($cmd 2>&1)
526+ ret=$?
527+ if (($ret != 0)); then
528+ log_err "cmd:$cmd failed out:$out"
529+ fi
530+
531+ # Now, try to rename the tmp_obj to the name which we failed to add earlier.
532+ # This should fail as well.
533+ out=$(mv $TMP_OBJ $save_name 2>&1)
534+ ret=$?
535+ if (($ret != 0)); then
536+ if [[ $out = *@(No space left on device)* ]]; then
537+ log_note "$cmd failed as expected : $out"
538+ else
539+ log_err "$cmd failed with : $out"
540+ fi
541+ fi
542+}
543+
544+for obj_type in ${obj_type[*]};
545+do
546+ log_note "Testing create of $obj_type"
547+ test_ops $obj_type $TESTDIR
548+done
549+
550+log_pass "Mixed mode FS: Ops on large number of colliding names fail gracefully"
551--
5522.14.2
553