]>
Commit | Line | Data |
---|---|---|
b32b8144 FG |
1 | #!/usr/bin/env bash |
2 | # | |
3 | # Copyright (C) 2017 Red Hat <contact@redhat.com> | |
4 | # | |
5 | # Author: David Zafman <dzafman@redhat.com> | |
6 | # | |
7 | # This program is free software; you can redistribute it and/or modify | |
8 | # it under the terms of the GNU Library Public License as published by | |
9 | # the Free Software Foundation; either version 2, or (at your option) | |
10 | # any later version. | |
11 | # | |
12 | # This program is distributed in the hope that it will be useful, | |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | # GNU Library Public License for more details. | |
16 | # | |
17 | ||
18 | source $CEPH_ROOT/qa/standalone/ceph-helpers.sh | |
19 | ||
#######################################
# Set up the environment shared by all tests, then run the selected test
# functions, each one between a setup and a teardown of the work directory.
# Globals:
#   CEPH_MON, CEPH_ARGS       (exported; CEPH_ARGS is appended to)
#   margin, objects, poolname (exported, written)
# Arguments:
#   $1   - work directory passed to setup, to each test and to teardown
#   $2.. - optional names of the functions to run; when none are given,
#          every defined function matching TEST_[0-9a-z_]* is run
# Returns:
#   0 when everything succeeds, 1 as soon as a setup, a test or a teardown
#   fails (the teardown of a failed test is not run here)
#######################################
function run() {
    local dir=$1
    shift

    # Fix port????
    export CEPH_MON="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    export margin=10
    export objects=200
    export poolname=test

    # Word splitting of $funcs is intentional: it is a list of function names.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        # Quote the directory so a path containing whitespace stays one argument.
        setup "$dir" || return 1
        "$func" "$dir" || return 1
        teardown "$dir" || return 1
    done
}
40 | ||
#######################################
# Succeed when a value is at, or at most $margin below, a target.
# Globals:
#   margin (read) - allowed distance below the target
# Arguments:
#   $1 - value to check (integer)
#   $2 - target (integer)
# Returns:
#   0 when target - margin <= value <= target; 1 otherwise, including when
#   an argument is missing or is not an integer
#######################################
function below_margin() {
    # Callers pass unquoted command substitutions: an empty value vanishes
    # and the target slides into $1. Reject that instead of silently
    # comparing the target against 0.
    if [[ $# -lt 2 || ! $1 =~ ^-?[0-9]+$ || ! $2 =~ ^-?[0-9]+$ ]]; then
        echo "below_margin: expected <check> <target>, got '$*'" >&2
        return 1
    fi
    local -i check=$1
    local -i target=$2

    return $(( check <= target && check >= target - margin ? 0 : 1 ))
}
48 | ||
#######################################
# Succeed when a value is at, or at most $margin above, a target.
# Globals:
#   margin (read) - allowed distance above the target
# Arguments:
#   $1 - value to check (integer)
#   $2 - target (integer)
# Returns:
#   0 when target <= value <= target + margin; 1 otherwise, including when
#   an argument is missing or is not an integer
#######################################
function above_margin() {
    # Callers pass unquoted command substitutions: an empty value vanishes
    # and the target slides into $1, which made "above_margin $LAST 0"
    # succeed with nothing to check. Reject that case explicitly.
    if [[ $# -lt 2 || ! $1 =~ ^-?[0-9]+$ || ! $2 =~ ^-?[0-9]+$ ]]; then
        echo "above_margin: expected <check> <target>, got '$*'" >&2
        return 1
    fi
    local -i check=$1
    local -i target=$2

    return $(( check >= target && check <= target + margin ? 0 : 1 ))
}
56 | ||
#######################################
# Verify the degraded and misplaced counts that a PG logged through
# "_update_calc_stats" while it was recovering.
# Globals:
#   margin (read by below_margin / above_margin)
# Arguments:
#   $1 - PG id as it appears right after "pg[" in the log
#   $2 - path of the OSD log to scan
#   $3 - expected degraded count at the start of recovery
#   $4 - expected degraded count at the end of recovery
#   $5 - expected misplaced count at the start of recovery
#   $6 - expected misplaced count at the end of recovery
#   $7 - pool type; "erasure" requires a "p" right after the OSD set when
#        picking the first line, anything else (or nothing) a space
# Returns:
#   0 when all four counts are within margin; 1 otherwise, or when a count
#   cannot be found in the log at all
#######################################
function check() {
    local PG=$1
    local log=$2
    local degraded_start=$3
    local degraded_end=$4
    local misplaced_start=$5
    local misplaced_end=$6
    local type=$7

    local addp=" "
    if [ "$type" = "erasure" ]; then
        addp="p"
    fi

    # OSD set printed on the last recovering _update_calc_stats line.
    local UPACT
    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " "$log" | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    local stat start_var end_var FIRST LAST
    for stat in degraded misplaced; do
        start_var=${stat}_start
        end_var=${stat}_end

        # Start value: first line that already shows the final OSD set,
        # which skips lines logged by false recovery starts.
        FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats ${stat} " "$log" | grep -F " ${UPACT}${addp}" | head -1 | sed "s/.* \([0-9]*\)$/\1/")
        # End value: the last line logged for this statistic.
        LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats ${stat} " "$log" | tail -1 | sed "s/.* \([0-9]*\)$/\1/")

        # A missing value must fail; it used to compare as 0 and pass
        # whenever the expected count was 0.
        if [ -z "$FIRST" ] || [ -z "$LAST" ]; then
            echo "check: no '_update_calc_stats ${stat}' value for pg ${PG} in ${log}" >&2
            return 1
        fi

        below_margin "$FIRST" "${!start_var}" || return 1
        above_margin "$LAST" "${!end_var}" || return 1
    done
}
86 | ||
87 | # [1,0,?] -> [1,2,4] | |
88 | # degraded 500 -> 0 | |
89 | # active+recovering+degraded | |
90 | ||
91 | # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP | |
92 | # 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467 | |
#######################################
# Shared body of the "one OSD killed and marked out" recovery tests.
# Starts a mon, a mgr and six OSDs, creates a 1-PG pool of the given type,
# writes $objects empty objects, kills and marks out one non-primary OSD
# while norecover is set, then lets recovery run and checks the logged
# statistics: degraded $objects -> 0, misplaced 0 -> 0.
# Globals:
#   poolname, objects (read)
# Arguments:
#   $1 - work directory of the test cluster
#   $2 - pool type; "erasure" also creates and uses the "myprofile" profile
# Returns:
#   0 on success, 1 when a checked step fails
#######################################
function do_recovery_out1() {
    local dir=$1
    shift
    local type=$1

    run_mon "$dir" a || return 1
    run_mgr "$dir" x || return 1
    local osd
    for osd in 0 1 2 3 4 5; do
        run_osd "$dir" "$osd" || return 1
    done

    if [ "$type" = "erasure" ]; then
        ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
        create_pool "$poolname" 1 1 "$type" myprofile
    else
        create_pool "$poolname" 1 1 "$type"
    fi

    wait_for_clean || return 1

    # The expected counts below assume every object was written.
    local i
    for ((i = 1; i <= objects; i++)); do
        rados -p "$poolname" put "obj$i" /dev/null || return 1
    done

    local primary=$(get_primary "$poolname" obj1)
    local PG=$(get_pg "$poolname" obj1)
    # An OSD of the PG other than the primary, as reported by get_not_primary
    local otherosd=$(get_not_primary "$poolname" obj1)

    ceph osd set norecover
    kill "$(cat "$dir/osd.${otherosd}.pid")"
    ceph osd down "osd.${otherosd}"
    ceph osd out "osd.${otherosd}"
    ceph osd unset norecover
    ceph tell "osd.$(get_primary "$poolname" obj1)" debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # The statistics are read from the log of the primary chosen before the kill.
    local log=$dir/osd.${primary}.log
    check "$PG" "$log" "$objects" 0 0 0 "$type" || return 1

    delete_pool "$poolname"
    kill_daemons "$dir" || return 1
}
143 | ||
#######################################
# Run the one-OSD-out recovery scenario on a replicated pool.
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when do_recovery_out1 fails
#######################################
function TEST_recovery_replicated_out1() {
    local dir=$1

    # Quoted so a directory containing whitespace is passed as one argument.
    do_recovery_out1 "$dir" replicated || return 1
}
149 | ||
#######################################
# Run the one-OSD-out recovery scenario on an erasure-coded pool.
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when do_recovery_out1 fails
#######################################
function TEST_recovery_erasure_out1() {
    local dir=$1

    # Quoted so a directory containing whitespace is passed as one argument.
    do_recovery_out1 "$dir" erasure || return 1
}
155 | ||
156 | # [0, 1] -> [2,3,4,5] | |
157 | # degraded 2000 -> 0 | |
158 | # missing on primary 500 -> 0 | |
159 | ||
160 | # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP | |
161 | # 1.0 500 500 2000 0 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748 | |
#######################################
# Recovery after growing a pool from size 2 to size 4 while both OSDs that
# held the PG are marked out. Checks the logged statistics:
# degraded objects*4 -> 0, misplaced 0 -> 0, and "missing shard <primary>"
# objects -> 0 in the new primary's log.
# Globals:
#   poolname, objects (read)
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when a checked step fails
#######################################
function TEST_recovery_sizeup() {
    local dir=$1

    run_mon "$dir" a || return 1
    run_mgr "$dir" x || return 1
    local osd
    for osd in 0 1 2 3 4 5; do
        run_osd "$dir" "$osd" || return 1
    done

    create_pool "$poolname" 1 1
    ceph osd pool set "$poolname" size 2

    wait_for_clean || return 1

    # The expected counts below assume every object was written.
    local i
    for ((i = 1; i <= objects; i++)); do
        rados -p "$poolname" put "obj$i" /dev/null || return 1
    done

    local primary=$(get_primary "$poolname" obj1)
    local PG=$(get_pg "$poolname" obj1)
    # Only 2 OSDs so only 1 not primary
    local otherosd=$(get_not_primary "$poolname" obj1)

    ceph osd set norecover
    ceph osd out "osd.$primary" "osd.$otherosd"
    # Use $poolname rather than a literal pool name so the test follows run().
    ceph osd pool set "$poolname" size 4
    ceph osd unset norecover
    ceph tell "osd.$(get_primary "$poolname" obj1)" debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary "$poolname" obj1)

    local degraded=$((objects * 4))
    local log=$dir/osd.${primary}.log
    check "$PG" "$log" "$degraded" 0 0 0 || return 1

    local UPACT FIRST LAST
    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " "$log" | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " "$log" | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin "$FIRST" "$objects" || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " "$log" | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin "$LAST" 0 || return 1

    delete_pool "$poolname"
    kill_daemons "$dir" || return 1
}
216 | ||
217 | # [0, 1, 2, 4] -> [3, 5] | |
218 | # degraded 1000 -> 0 | |
219 | # missing on primary 500 -> 0 | |
220 | # active+recovering+degraded | |
221 | ||
222 | # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP | |
223 | # 1.0 500 500 1000 0 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248 | |
#######################################
# Recovery after shrinking a pool from size 4 to size 2 while every OSD that
# held the PG is marked out. Checks the logged statistics:
# degraded objects*2 -> 0, misplaced 0 -> 0, and "missing shard <primary>"
# objects -> 0 in the new primary's log.
# Globals:
#   poolname, objects (read)
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when a checked step fails
#######################################
function TEST_recovery_sizedown() {
    local dir=$1

    run_mon "$dir" a || return 1
    run_mgr "$dir" x || return 1
    local osd
    for osd in 0 1 2 3 4 5; do
        run_osd "$dir" "$osd" || return 1
    done

    create_pool "$poolname" 1 1
    ceph osd pool set "$poolname" size 4

    wait_for_clean || return 1

    # The expected counts below assume every object was written.
    local i
    for ((i = 1; i <= objects; i++)); do
        rados -p "$poolname" put "obj$i" /dev/null || return 1
    done

    local primary=$(get_primary "$poolname" obj1)
    local PG=$(get_pg "$poolname" obj1)
    # OSD ids reported by get_osds for the object; all of them get marked out
    local allosds=$(get_osds "$poolname" obj1)

    ceph osd set norecover
    # Word splitting of $allosds is intentional: it is a list of OSD ids.
    for osd in $allosds; do
        ceph osd out "osd.$osd"
    done

    # Use $poolname rather than a literal pool name so the test follows run().
    ceph osd pool set "$poolname" size 2
    ceph osd unset norecover
    ceph tell "osd.$(get_primary "$poolname" obj1)" debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary "$poolname" obj1)

    local degraded=$((objects * 2))
    local log=$dir/osd.${primary}.log
    check "$PG" "$log" "$degraded" 0 0 0 || return 1

    local UPACT FIRST LAST
    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " "$log" | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " "$log" | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin "$FIRST" "$objects" || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " "$log" | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin "$LAST" 0 || return 1

    delete_pool "$poolname"
    kill_daemons "$dir" || return 1
}
282 | ||
283 | # [1] -> [1,2] | |
284 | # degraded 200 -> 100 | |
285 | # active+recovering+undersized+degraded | |
286 | ||
287 | # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP | |
288 | # 1.0 100 0 200 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563 | |
#######################################
# Recovery of a PG that stays undersized: with three OSDs, a size-1 pool is
# grown to size 3 after one non-primary OSD has been marked out. Checks the
# logged "degraded" statistic in the primary's log: objects*2 -> objects.
# Globals:
#   poolname, objects (read)
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when a checked step fails or recovery does not finish
#   within the polling loop
#######################################
function TEST_recovery_undersized() {
    local dir=$1

    run_mon "$dir" a || return 1
    run_mgr "$dir" x || return 1
    local osd
    for osd in 0 1 2; do
        run_osd "$dir" "$osd" || return 1
    done

    create_pool "$poolname" 1 1
    ceph osd pool set "$poolname" size 1

    wait_for_clean || return 1

    # The expected counts below assume every object was written.
    local i
    for ((i = 1; i <= objects; i++)); do
        rados -p "$poolname" put "obj$i" /dev/null || return 1
    done

    local primary=$(get_primary "$poolname" obj1)
    local PG=$(get_pg "$poolname" obj1)

    ceph osd set norecover
    # Mark any osd not the primary (only 1 replica so also has no replica)
    for osd in 0 1 2; do
        if [ "$osd" = "$primary" ]; then
            continue
        fi
        ceph osd out "osd.$osd"
        break
    done
    # Use $poolname rather than a literal pool name so the test follows run().
    ceph osd pool set "$poolname" size 3
    ceph osd unset norecover
    ceph tell "osd.$(get_primary "$poolname" obj1)" debug kick_recovery_wq 0
    # Give extra sleep time because code below doesn't have the sophistication of wait_for_clean()
    sleep 10
    flush_pg_stats

    # Wait for recovery to finish
    # Can't use wait_for_clean() because state goes from active+recovering+undersized+degraded
    # to active+undersized+degraded
    local try
    for ((try = 1; try <= 60; try++)); do
        if ceph pg dump pgs | grep "^$PG" | grep -qv recovering; then
            break
        fi
        if ((try == 60)); then
            echo "Timeout waiting for recovery to finish"
            return 1
        fi
        sleep 1
    done

    # Get new primary
    primary=$(get_primary "$poolname" obj1)
    local log=$dir/osd.${primary}.log

    local UPACT FIRST LAST
    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " "$log" | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    local degraded=$((objects * 2))
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " "$log" | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin "$FIRST" "$degraded" || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " "$log" | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin "$LAST" "$objects" || return 1

    delete_pool "$poolname"
    kill_daemons "$dir" || return 1
}
361 | ||
362 | # [1,0,2] -> [1,3,NONE]/[1,3,2] | |
363 | # degraded 100 -> 0 | |
364 | # misplaced 100 -> 100 | |
365 | # active+recovering+degraded+remapped | |
366 | ||
367 | # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP | |
368 | # 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242 | |
#######################################
# Recovery of a k=2 m=1 erasure-coded PG on four OSDs after one non-primary
# OSD is killed and marked out and one more OSD (neither the primary nor the
# killed one) is marked out. Checks the logged statistics in the primary's
# log: degraded objects -> 0, misplaced objects -> objects.
# Globals:
#   poolname, objects (read)
# Arguments:
#   $1 - work directory of the test cluster
# Returns:
#   0 on success, 1 when a checked step fails
#######################################
function TEST_recovery_erasure_remapped() {
    local dir=$1

    run_mon "$dir" a || return 1
    run_mgr "$dir" x || return 1
    local osd
    for osd in 0 1 2 3; do
        run_osd "$dir" "$osd" || return 1
    done

    ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
    create_pool "$poolname" 1 1 erasure myprofile
    ceph osd pool set "$poolname" min_size 2

    wait_for_clean || return 1

    # The expected counts below assume every object was written.
    local i
    for ((i = 1; i <= objects; i++)); do
        rados -p "$poolname" put "obj$i" /dev/null || return 1
    done

    local primary=$(get_primary "$poolname" obj1)
    local PG=$(get_pg "$poolname" obj1)
    local otherosd=$(get_not_primary "$poolname" obj1)

    ceph osd set norecover
    kill "$(cat "$dir/osd.${otherosd}.pid")"
    ceph osd down "osd.${otherosd}"
    ceph osd out "osd.${otherosd}"

    # Mark osd not the primary and not down/out osd as just out.
    # The comparisons are quoted: with an empty value the unquoted test
    # errored out and the loop went on to mark the wrong OSD out.
    for osd in 0 1 2 3; do
        if [ "$osd" = "$primary" ]; then
            continue
        fi
        if [ "$osd" = "$otherosd" ]; then
            continue
        fi
        ceph osd out "osd.$osd"
        break
    done
    ceph osd unset norecover
    ceph tell "osd.$(get_primary "$poolname" obj1)" debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    local log=$dir/osd.${primary}.log
    check "$PG" "$log" "$objects" 0 "$objects" "$objects" erasure || return 1

    delete_pool "$poolname"
    kill_daemons "$dir" || return 1
}
425 | ||
# Entry point: call main with the name "recout" followed by the script's
# command-line arguments. main is not defined in this file; presumably it
# comes from the sourced ceph-helpers.sh and ends up calling run() above
# (TODO confirm).
main recout "$@"
427 | ||
428 | # Local Variables: | |
429 | # compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh" | |
430 | # End: |