]>
Commit | Line | Data |
---|---|---|
a010b409 SI |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Sara Hartse <sara.hartse@gmail.com> | |
3 | Date: Thu, 31 May 2018 10:36:37 -0700 | |
4 | Subject: [PATCH] zpool reopen should detect expanded devices | |
5 | ||
6 | Update bdev_capacity to have wholedisk vdevs query the | |
7 | size of the underlying block device (correcting for the size | |
8 | of the efi partition and partition alignment) and therefore detect | |
9 | expanded space. | |
10 | ||
11 | Correct vdev_get_stats_ex so that the expandsize is aligned | |
12 | to metaslab size and new space is only reported if it is large | |
13 | enough for a new metaslab. | |
14 | ||
15 | Reviewed by: Don Brady <don.brady@delphix.com> | |
16 | Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> | |
17 | Reviewed by: George Wilson <george.wilson@delphix.com> | |
18 | Reviewed-by: Matthew Ahrens <mahrens@delphix.com> | |
19 | Reviewed by: John Wren Kennedy <jwk404@gmail.com> | |
20 | Signed-off-by: sara hartse <sara.hartse@delphix.com> | |
21 | External-issue: LX-165 | |
22 | Closes #7546 | |
23 | Issue #7582 | |
24 | ||
25 | Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> | |
26 | --- | |
27 | include/sys/vdev_disk.h | 12 +++++ | |
28 | lib/libefi/rdwr_efi.c | 20 +++++++- | |
29 | lib/libzfs/libzfs_pool.c | 14 +----- | |
30 | module/zfs/vdev.c | 3 +- | |
31 | module/zfs/vdev_disk.c | 46 +++++++++++++----- | |
32 | .../cli_root/zpool_expand/zpool_expand_002_pos.ksh | 54 +++++++++++++++------- | |
33 | 6 files changed, 107 insertions(+), 42 deletions(-) | |
34 | ||
35 | diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h | |
36 | index 15570b10..b8a32b31 100644 | |
37 | --- a/include/sys/vdev_disk.h | |
38 | +++ b/include/sys/vdev_disk.h | |
39 | @@ -23,11 +23,23 @@ | |
40 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
41 | * Written by Brian Behlendorf <behlendorf1@llnl.gov>. | |
42 | * LLNL-CODE-403049. | |
43 | + * Copyright (c) 2018 by Delphix. All rights reserved. | |
44 | */ | |
45 | ||
46 | #ifndef _SYS_VDEV_DISK_H | |
47 | #define _SYS_VDEV_DISK_H | |
48 | ||
49 | +/* | |
50 | + * Don't start the slice at the default block of 34; many storage | |
51 | + * devices will use a stripe width of 128k, other vendors prefer a 1m | |
52 | + * alignment. It is best to play it safe and ensure a 1m alignment | |
53 | + * given 512B blocks. When the block size is larger by a power of 2 | |
54 | + * we will still be 1m aligned. Some devices are sensitive to the | |
55 | + * partition ending alignment as well. | |
56 | + */ | |
57 | +#define NEW_START_BLOCK 2048 | |
58 | +#define PARTITION_END_ALIGNMENT 2048 | |
59 | + | |
60 | #ifdef _KERNEL | |
61 | #include <sys/vdev.h> | |
62 | ||
63 | diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c | |
64 | index 7935047e..19cb17e5 100644 | |
65 | --- a/lib/libefi/rdwr_efi.c | |
66 | +++ b/lib/libefi/rdwr_efi.c | |
67 | @@ -22,6 +22,7 @@ | |
68 | /* | |
69 | * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. | |
70 | * Copyright 2012 Nexenta Systems, Inc. All rights reserved. | |
71 | + * Copyright (c) 2018 by Delphix. All rights reserved. | |
72 | */ | |
73 | ||
74 | #include <stdio.h> | |
75 | @@ -1153,7 +1154,7 @@ efi_use_whole_disk(int fd) | |
76 | ||
77 | /* | |
78 | * Find the last physically non-zero partition. | |
79 | - * This is the reserved partition. | |
80 | + * This should be the reserved partition. | |
81 | */ | |
82 | for (i = 0; i < efi_label->efi_nparts; i ++) { | |
83 | if (resv_start < efi_label->efi_parts[i].p_start) { | |
84 | @@ -1163,6 +1164,23 @@ efi_use_whole_disk(int fd) | |
85 | } | |
86 | ||
87 | /* | |
88 | + * Verify that we've found the reserved partition by checking | |
89 | + * that it looks the way it did when we created it in zpool_label_disk. | |
90 | + * If we've found the incorrect partition, then we know that this | |
91 | + * device was reformatted and no longer is solely used by ZFS. | |
92 | + */ | |
93 | + if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) || | |
94 | + (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) || | |
95 | + (resv_index != 8)) { | |
96 | + if (efi_debug) { | |
97 | + (void) fprintf(stderr, | |
98 | + "efi_use_whole_disk: wholedisk not available\n"); | |
99 | + } | |
100 | + efi_free(efi_label); | |
101 | + return (VT_ENOSPC); | |
102 | + } | |
103 | + | |
104 | + /* | |
105 | * Find the last physically non-zero partition before that. | |
106 | * This is the data partition. | |
107 | */ | |
108 | diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c | |
109 | index e00d5f51..53bc5034 100644 | |
110 | --- a/lib/libzfs/libzfs_pool.c | |
111 | +++ b/lib/libzfs/libzfs_pool.c | |
112 | @@ -22,7 +22,7 @@ | |
113 | /* | |
114 | * Copyright 2015 Nexenta Systems, Inc. All rights reserved. | |
115 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
116 | - * Copyright (c) 2011, 2014 by Delphix. All rights reserved. | |
117 | + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. | |
118 | * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com> | |
119 | * Copyright (c) 2017 Datto Inc. | |
120 | */ | |
121 | @@ -42,6 +42,7 @@ | |
122 | #include <sys/efi_partition.h> | |
123 | #include <sys/vtoc.h> | |
124 | #include <sys/zfs_ioctl.h> | |
125 | +#include <sys/vdev_disk.h> | |
126 | #include <dlfcn.h> | |
127 | ||
128 | #include "zfs_namecheck.h" | |
129 | @@ -913,17 +914,6 @@ zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, | |
130 | } | |
131 | ||
132 | /* | |
133 | - * Don't start the slice at the default block of 34; many storage | |
134 | - * devices will use a stripe width of 128k, other vendors prefer a 1m | |
135 | - * alignment. It is best to play it safe and ensure a 1m alignment | |
136 | - * given 512B blocks. When the block size is larger by a power of 2 | |
137 | - * we will still be 1m aligned. Some devices are sensitive to the | |
138 | - * partition ending alignment as well. | |
139 | - */ | |
140 | -#define NEW_START_BLOCK 2048 | |
141 | -#define PARTITION_END_ALIGNMENT 2048 | |
142 | - | |
143 | -/* | |
144 | * Validate the given pool name, optionally putting an extended error message in | |
145 | * 'buf'. | |
146 | */ | |
147 | diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c | |
148 | index acac2a97..b643bd35 100644 | |
149 | --- a/module/zfs/vdev.c | |
150 | +++ b/module/zfs/vdev.c | |
151 | @@ -21,7 +21,7 @@ | |
152 | ||
153 | /* | |
154 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
155 | - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. | |
156 | + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. | |
157 | * Copyright 2017 Nexenta Systems, Inc. | |
158 | * Copyright (c) 2014 Integros [integros.com] | |
159 | * Copyright 2016 Toomas Soome <tsoome@me.com> | |
160 | @@ -3039,7 +3039,6 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) | |
161 | vd->vdev_max_asize - vd->vdev_asize, | |
162 | 1ULL << tvd->vdev_ms_shift); | |
163 | } | |
164 | - vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; | |
165 | if (vd->vdev_aux == NULL && vd == vd->vdev_top && | |
166 | !vd->vdev_ishole) { | |
167 | vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; | |
168 | diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c | |
169 | index 6761e755..6dc0544f 100644 | |
170 | --- a/module/zfs/vdev_disk.c | |
171 | +++ b/module/zfs/vdev_disk.c | |
172 | @@ -23,7 +23,7 @@ | |
173 | * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). | |
174 | * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>. | |
175 | * LLNL-CODE-403049. | |
176 | - * Copyright (c) 2012, 2015 by Delphix. All rights reserved. | |
177 | + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. | |
178 | */ | |
179 | ||
180 | #include <sys/zfs_context.h> | |
181 | @@ -35,10 +35,14 @@ | |
182 | #include <sys/zio.h> | |
183 | #include <sys/sunldi.h> | |
184 | #include <linux/mod_compat.h> | |
185 | +#include <linux/msdos_fs.h> | |
186 | ||
187 | char *zfs_vdev_scheduler = VDEV_SCHEDULER; | |
188 | static void *zfs_vdev_holder = VDEV_HOLDER; | |
189 | ||
190 | +/* size of the "reserved" partition, in blocks */ | |
191 | +#define EFI_MIN_RESV_SIZE (16 * 1024) | |
192 | + | |
193 | /* | |
194 | * Virtual device vector for disks. | |
195 | */ | |
196 | @@ -82,17 +86,39 @@ vdev_bdev_mode(int smode) | |
197 | } | |
198 | #endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ | |
199 | ||
200 | +/* The capacity (in bytes) of a bdev that is available to be used by a vdev */ | |
201 | static uint64_t | |
202 | -bdev_capacity(struct block_device *bdev) | |
203 | +bdev_capacity(struct block_device *bdev, boolean_t wholedisk) | |
204 | { | |
205 | struct hd_struct *part = bdev->bd_part; | |
206 | + uint64_t sectors = get_capacity(bdev->bd_disk); | |
207 | + /* If there are no partitions, return the entire device capacity */ | |
208 | + if (part == NULL) | |
209 | + return (sectors << SECTOR_BITS); | |
210 | ||
211 | - /* The partition capacity referenced by the block device */ | |
212 | - if (part) | |
213 | - return (part->nr_sects << 9); | |
214 | - | |
215 | - /* Otherwise assume the full device capacity */ | |
216 | - return (get_capacity(bdev->bd_disk) << 9); | |
217 | + /* | |
218 | + * If there are partitions, decide if we are using a `wholedisk` | |
219 | + * layout (composed of part1 and part9) or just a single partition. | |
220 | + */ | |
221 | + if (wholedisk) { | |
222 | + /* Verify the expected device layout */ | |
223 | + ASSERT3P(bdev, !=, bdev->bd_contains); | |
224 | + /* | |
225 | + * Sectors used by the EFI partition (part9) as well as | |
226 | + * partition alignment. | |
227 | + */ | |
228 | + uint64_t used = EFI_MIN_RESV_SIZE + NEW_START_BLOCK + | |
229 | + PARTITION_END_ALIGNMENT; | |
230 | + | |
231 | + /* Space available to the vdev, i.e. the size of part1 */ | |
232 | + if (sectors <= used) | |
233 | + return (0); | |
234 | + uint64_t available = sectors - used; | |
235 | + return (available << SECTOR_BITS); | |
236 | + } else { | |
237 | + /* The partition capacity referenced by the block device */ | |
238 | + return (part->nr_sects << SECTOR_BITS); | |
239 | + } | |
240 | } | |
241 | ||
242 | static void | |
243 | @@ -328,9 +354,7 @@ skip_open: | |
244 | v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); | |
245 | ||
246 | /* Physical volume size in bytes */ | |
247 | - *psize = bdev_capacity(vd->vd_bdev); | |
248 | - | |
249 | - /* TODO: report possible expansion size */ | |
250 | + *psize = bdev_capacity(vd->vd_bdev, v->vdev_wholedisk); | |
251 | *max_psize = *psize; | |
252 | ||
253 | /* Based on the minimum sector size set the block size */ | |
254 | diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh | |
255 | index d578ae60..66b6969d 100755 | |
256 | --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh | |
257 | +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh | |
258 | @@ -26,7 +26,7 @@ | |
259 | # | |
260 | ||
261 | # | |
262 | -# Copyright (c) 2012, 2016 by Delphix. All rights reserved. | |
263 | +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. | |
264 | # Copyright (c) 2017 Lawrence Livermore National Security, LLC. | |
265 | # | |
266 | ||
267 | @@ -43,8 +43,9 @@ | |
268 | # 1) Create 3 files | |
269 | # 2) Create a pool backed by the files | |
270 | # 3) Expand the files' size with truncate | |
271 | -# 4) Use zpool online -e to online the vdevs | |
272 | -# 5) Check that the pool size was expanded | |
273 | +# 4) Use zpool reopen to check the expandsize | |
274 | +# 5) Use zpool online -e to online the vdevs | |
275 | +# 6) Check that the pool size was expanded | |
276 | # | |
277 | ||
278 | verify_runnable "global" | |
279 | @@ -64,8 +65,8 @@ log_onexit cleanup | |
280 | ||
281 | log_assert "zpool can expand after zpool online -e zvol vdevs on LUN expansion" | |
282 | ||
283 | - | |
284 | for type in " " mirror raidz raidz2; do | |
285 | + # Initialize the file devices and the pool | |
286 | for i in 1 2 3; do | |
287 | log_must truncate -s $org_size ${TEMPFILE}.$i | |
288 | done | |
289 | @@ -80,13 +81,35 @@ for type in " " mirror raidz raidz2; do | |
290 | "$autoexp" | |
291 | fi | |
292 | typeset prev_size=$(get_pool_prop size $TESTPOOL1) | |
293 | - typeset zfs_prev_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \ | |
294 | - awk '{print $3}') | |
295 | + typeset zfs_prev_size=$(get_prop avail $TESTPOOL1) | |
296 | ||
297 | + # Increase the size of the file devices | |
298 | for i in 1 2 3; do | |
299 | log_must truncate -s $exp_size ${TEMPFILE}.$i | |
300 | done | |
301 | ||
302 | + # Reopen the pool and check that the `expandsize` property is set | |
303 | + log_must zpool reopen $TESTPOOL1 | |
304 | + typeset zpool_expandsize=$(get_pool_prop expandsize $TESTPOOL1) | |
305 | + | |
306 | + if [[ $type == "mirror" ]]; then | |
307 | + typeset expected_zpool_expandsize=$(($exp_size-$org_size)) | |
308 | + else | |
309 | + typeset expected_zpool_expandsize=$((3*($exp_size-$org_size))) | |
310 | + fi | |
311 | + | |
312 | + if [[ "$zpool_expandsize" = "-" ]]; then | |
313 | + log_fail "pool $TESTPOOL1 did not detect any " \ | |
314 | + "expandsize after reopen" | |
315 | + fi | |
316 | + | |
317 | + if [[ $zpool_expandsize -ne $expected_zpool_expandsize ]]; then | |
318 | + log_fail "pool $TESTPOOL1 did not detect correct " \ | |
319 | + "expandsize after reopen: found $zpool_expandsize," \ | |
320 | + "expected $expected_zpool_expandsize" | |
321 | + fi | |
322 | + | |
323 | + # Online the devices to add the new space to the pool | |
324 | for i in 1 2 3; do | |
325 | log_must zpool online -e $TESTPOOL1 ${TEMPFILE}.$i | |
326 | done | |
327 | @@ -96,8 +119,7 @@ for type in " " mirror raidz raidz2; do | |
328 | sync | |
329 | ||
330 | typeset expand_size=$(get_pool_prop size $TESTPOOL1) | |
331 | - typeset zfs_expand_size=$(zfs get -p avail $TESTPOOL1 | tail -1 | \ | |
332 | - awk '{print $3}') | |
333 | + typeset zfs_expand_size=$(get_prop avail $TESTPOOL1) | |
334 | log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ | |
335 | "expanded size: $expand_size" | |
336 | ||
337 | @@ -112,8 +134,8 @@ for type in " " mirror raidz raidz2; do | |
338 | grep "(+${expansion_size}" | wc -l) | |
339 | ||
340 | if [[ $size_addition -ne $i ]]; then | |
341 | - log_fail "pool $TESTPOOL1 is not autoexpand " \ | |
342 | - "after LUN expansion" | |
343 | + log_fail "pool $TESTPOOL1 did not expand " \ | |
344 | + "after LUN expansion and zpool online -e" | |
345 | fi | |
346 | elif [[ $type == "mirror" ]]; then | |
347 | typeset expansion_size=$(($exp_size-$org_size)) | |
348 | @@ -123,8 +145,8 @@ for type in " " mirror raidz raidz2; do | |
349 | grep "(+${expansion_size})" >/dev/null 2>&1 | |
350 | ||
351 | if [[ $? -ne 0 ]]; then | |
352 | - log_fail "pool $TESTPOOL1 is not autoexpand " \ | |
353 | - "after LUN expansion" | |
354 | + log_fail "pool $TESTPOOL1 did not expand " \ | |
355 | + "after LUN expansion and zpool online -e" | |
356 | fi | |
357 | else | |
358 | typeset expansion_size=$((3*($exp_size-$org_size))) | |
359 | @@ -134,13 +156,13 @@ for type in " " mirror raidz raidz2; do | |
360 | grep "(+${expansion_size})" >/dev/null 2>&1 | |
361 | ||
362 | if [[ $? -ne 0 ]] ; then | |
363 | - log_fail "pool $TESTPOOL1 is not autoexpand " \ | |
364 | - "after LUN expansion" | |
365 | + log_fail "pool $TESTPOOL1 did not expand " \ | |
366 | + "after LUN expansion and zpool online -e" | |
367 | fi | |
368 | fi | |
369 | else | |
370 | - log_fail "pool $TESTPOOL1 is not autoexpanded after LUN " \ | |
371 | - "expansion" | |
372 | + log_fail "pool $TESTPOOL1 did not expand after LUN expansion " \ | |
373 | + "and zpool online -e" | |
374 | fi | |
375 | log_must zpool destroy $TESTPOOL1 | |
376 | done |