3 # Copyright (C) 2017 Red Hat <contact@redhat.com>
5 # Author: David Zafman <dzafman@redhat.com>
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
# Pull in the standalone-test helper library (run_mon, run_osd, create_pool,
# wait_for_clean, ...).  CEPH_ROOT must point at the ceph source tree.
# NOTE: the extracted text had this statement split across three lines
# (invalid shell); rejoined here.
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Fragment of the test runner (function run()): configures the monitor
# endpoint / CEPH_ARGS, then executes every TEST_* function with a fresh
# cluster.  NOTE(review): this extract is missing lines (the `function run()`
# header, `local dir=$1`, `export CEPH_ARGS`, the margin/objects/poolname
# exports referenced later, the loop `done`, and the closing brace) -- compare
# against the complete file before editing.
25 export CEPH_MON
="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
# Build CEPH_ARGS incrementally: fresh fsid, auth disabled, mon address above.
27 CEPH_ARGS
+="--fsid=$(uuidgen) --auth-supported=none "
28 CEPH_ARGS
+="--mon-host=$CEPH_MON "
# Default to every function named TEST_* (scraped from `set` output) when no
# explicit test names were passed.
33 local funcs
=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
34 for func
in $funcs ; do
# Fresh cluster per test: setup -> test -> teardown; abort on first failure.
35 setup
$dir ||
return 1
36 $func $dir ||
return 1
37 teardown
$dir ||
return 1
# Succeed (return 0) when $1 is within $margin BELOW $2, i.e.
#   target - margin <= check <= target
# Globals:   margin (read) - allowed slack, exported by run()
# Arguments: $1 - observed counter value, $2 - expected starting value
# Returns:   0 when in range, 1 otherwise
# NOTE(review): the `local` lines were missing from the mangled extract and
# are reconstructed here to match the upstream helper.
function below_margin() {
    local -i check=$1
    shift
    local -i target=$1

    return $(( $check <= $target && $check >= $target - $margin ? 0 : 1 ))
}
# Succeed (return 0) when $1 is within $margin ABOVE $2, i.e.
#   target <= check <= target + margin
# Globals:   margin (read) - allowed slack, exported by run()
# Arguments: $1 - observed counter value, $2 - expected ending value
# Returns:   0 when in range, 1 otherwise
# NOTE(review): the `local` lines were missing from the mangled extract and
# are reconstructed here to match the upstream helper.
function above_margin() {
    local -i check=$1
    shift
    local -i target=$1

    return $(( $check >= $target && $check <= $target + $margin ? 0 : 1 ))
}
# Single-quoted pipeline templates expanded later via `eval`; they expect
# $PG, $log, $which, $UPACT and $addp to be set at eval time.
# FIND_UPACT: last up/acting set logged by _update_calc_stats for this PG.
# FIND_FIRST: first counter value logged for $which under that up/acting set
#             (skipping "*est*" estimate lines).
# FIND_LAST:  last counter value logged for $which.
# NOTE: the extracted text had each assignment split across two lines
# (invalid shell); rejoined here with the string contents unchanged.
FIND_UPACT='grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/"'
FIND_FIRST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | grep -F " ${UPACT}${addp}" | grep -v est | head -1 | sed "s/.* \([0-9]*\)$/\1/"'
FIND_LAST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/"'
# Fragment of check(): verifies that the degraded / misplaced /
# missing-on-primary counters logged by _update_calc_stats start and end at
# the expected values (within $margin, via below_margin/above_margin).
# NOTE(review): this extract is missing lines (the `function check()` header,
# positional params $1-$4 and $6, the erasure `addp` branch body, `fi`
# closers, and the closing brace) -- compare against the complete file.
66 local degraded_start
=$5
68 local misplaced_start
=$7
69 local misplaced_end
=$8
# Primary checks are optional: only performed when $9/$10 were supplied.
70 local primary_start
=${9:-}
71 local primary_end
=${10:-}
73 local log
=$dir/osd.
${primary}.log
76 if [ "$type" = "erasure" ];
# Capture the up/acting set string used to anchor FIND_FIRST matches.
81 UPACT
=$
(eval $FIND_UPACT)
83 # Check 3rd line at start because of false recovery starts
84 local which="degraded"
85 FIRST
=$
(eval $FIND_FIRST)
86 below_margin
$FIRST $degraded_start ||
return 1
87 LAST
=$
(eval $FIND_LAST)
88 above_margin
$LAST $degraded_end ||
return 1
90 # Check 3rd line at start because of false recovery starts
# NOTE(review): which="misplaced" assignment (original line 91) is missing
# from this extract.
92 FIRST
=$
(eval $FIND_FIRST)
93 below_margin
$FIRST $misplaced_start ||
return 1
94 LAST
=$
(eval $FIND_LAST)
95 above_margin
$LAST $misplaced_end ||
return 1
97 # This is the value of set into MISSING_ON_PRIMARY
98 if [ -n "$primary_start" ];
100 which="shard $primary"
101 FIRST
=$
(eval $FIND_FIRST)
102 below_margin
$FIRST $primary_start ||
return 1
103 LAST
=$
(eval $FIND_LAST)
104 above_margin
$LAST $primary_end ||
return 1
110 # active+recovering+degraded
112 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
113 # 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467
# Shared scenario: take one non-primary OSD down+out and verify the degraded
# count drains from $objects to 0 during recovery.  $2 ($type) selects
# "replicated" or "erasure".
# NOTE(review): this extract is missing lines (`local dir=$1`, `local type=$2`,
# `then`/`else`/`fi`, loop `do`/`done`, sleeps, closing brace).
114 function do_recovery_out1
() {
119 run_mon
$dir a ||
return 1
120 run_mgr
$dir x ||
return 1
121 run_osd
$dir 0 ||
return 1
122 run_osd
$dir 1 ||
return 1
123 run_osd
$dir 2 ||
return 1
124 run_osd
$dir 3 ||
return 1
125 run_osd
$dir 4 ||
return 1
126 run_osd
$dir 5 ||
return 1
# Erasure pools need a profile first; replicated pools are created directly.
128 if [ $type = "erasure" ];
130 ceph osd erasure-code-profile
set myprofile plugin
=jerasure technique
=reed_sol_van k
=2 m
=1 crush-failure-domain
=osd
131 create_pool
$poolname 1 1 $type myprofile
133 create_pool
$poolname 1 1 $type
136 wait_for_clean ||
return 1
137 for i
in $
(seq 1 $objects)
140 rados
-p $poolname put obj
$i /dev
/null
143 local primary
=$
(get_primary
$poolname obj1
)
144 local PG
=$
(get_pg
$poolname obj1
)
145 # Only 2 OSDs so only 1 not primary
146 local otherosd
=$
(get_not_primary
$poolname obj1
)
# Hold recovery while we kill + down + out the non-primary, then release and
# kick the primary's recovery work queue so recovery starts immediately.
148 ceph osd
set norecover
149 kill $
(cat $dir/osd.
${otherosd}.pid
)
150 ceph osd down osd.
${otherosd}
151 ceph osd out osd.
${otherosd}
152 ceph osd
unset norecover
153 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
156 wait_for_clean ||
return 1
# degraded: $objects -> 0; misplaced and missing-on-primary stay 0.
158 check
$dir $PG $primary $type $objects 0 0 0 ||
return 1
160 delete_pool
$poolname
161 kill_daemons
$dir ||
return 1
# Run the "one OSD out" recovery scenario against a replicated pool.
# Arguments: $1 - test directory.
# NOTE(review): trivial body reconstructed from the mangled extract
# (`local dir=$1` and closing brace were missing).
function TEST_recovery_replicated_out1() {
    local dir=$1

    do_recovery_out1 $dir replicated || return 1
}
# Run the "one OSD out" recovery scenario against an erasure-coded pool.
# Arguments: $1 - test directory.
# NOTE(review): trivial body reconstructed from the mangled extract
# (`local dir=$1` and closing brace were missing).
function TEST_recovery_erasure_out1() {
    local dir=$1

    do_recovery_out1 $dir erasure || return 1
}
176 # [0, 1] -> [2,3,4,5]
178 # misplaced 1000 -> 0
179 # missing on primary 500 -> 0
181 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
182 # 1.0 500 500 1000 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748
# Grow a size-2 pool to size 4 while both original OSDs are out; the PG must
# move to 4 new OSDs and recover all objects.
# NOTE(review): extract is missing lines (`local dir=$1`, loop `do`/`done`,
# sleep, closing brace).
183 function TEST_recovery_sizeup
() {
186 run_mon
$dir a ||
return 1
187 run_mgr
$dir x ||
return 1
188 run_osd
$dir 0 ||
return 1
189 run_osd
$dir 1 ||
return 1
190 run_osd
$dir 2 ||
return 1
191 run_osd
$dir 3 ||
return 1
192 run_osd
$dir 4 ||
return 1
193 run_osd
$dir 5 ||
return 1
195 create_pool
$poolname 1 1
196 ceph osd pool
set $poolname size
2
198 wait_for_clean ||
return 1
200 for i
in $
(seq 1 $objects)
202 rados
-p $poolname put obj
$i /dev
/null
205 local primary
=$
(get_primary
$poolname obj1
)
206 local PG
=$
(get_pg
$poolname obj1
)
207 # Only 2 OSDs so only 1 not primary
208 local otherosd
=$
(get_not_primary
$poolname obj1
)
210 ceph osd
set norecover
211 ceph osd out osd.
$primary osd.
$otherosd
# NOTE(review): pool name "test" is hardcoded below; works only because
# run() exports poolname=test -- should presumably be $poolname. TODO confirm.
212 ceph osd pool
set test size
4
213 ceph osd
unset norecover
214 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
217 wait_for_clean ||
return 1
# Re-resolve: the primary changed after the old acting set was marked out.
220 primary
=$
(get_primary
$poolname obj1
)
# 2 new replicas per object are degraded and both old copies are misplaced.
222 local degraded
=$
(expr $objects \
* 2)
223 local misplaced
=$
(expr $objects \
* 2)
224 local log
=$dir/osd.
${primary}.log
225 check
$dir $PG $primary replicated
$degraded 0 $misplaced 0 $objects 0 ||
return 1
227 delete_pool
$poolname
228 kill_daemons
$dir ||
return 1
231 # [0, 1, 2, 4] -> [3, 5]
232 # misplaced 1000 -> 0
233 # missing on primary 500 -> 0
234 # active+recovering+degraded
236 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
237 # 1.0 500 500 0 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248
# Shrink a size-4 pool to size 2 with all 4 original OSDs marked out; the PG
# moves to 2 fresh OSDs, then this test checks the MISSING_ON_PRIMARY counter
# directly against the primary's log.
# NOTE(review): extract is missing lines (`local dir=$1`, the `for osd in
# $allosds` loop header around line 268, loop `do`/`done`, sleep, brace).
238 function TEST_recovery_sizedown
() {
241 run_mon
$dir a ||
return 1
242 run_mgr
$dir x ||
return 1
243 run_osd
$dir 0 ||
return 1
244 run_osd
$dir 1 ||
return 1
245 run_osd
$dir 2 ||
return 1
246 run_osd
$dir 3 ||
return 1
247 run_osd
$dir 4 ||
return 1
248 run_osd
$dir 5 ||
return 1
250 create_pool
$poolname 1 1
251 ceph osd pool
set $poolname size
4
253 wait_for_clean ||
return 1
255 for i
in $
(seq 1 $objects)
257 rados
-p $poolname put obj
$i /dev
/null
260 local primary
=$
(get_primary
$poolname obj1
)
261 local PG
=$
(get_pg
$poolname obj1
)
262 # Only 2 OSDs so only 1 not primary
263 local allosds
=$
(get_osds
$poolname obj1
)
265 ceph osd
set norecover
# Every OSD in the current acting set is marked out (loop header missing
# from this extract).
268 ceph osd out osd.
$osd
# NOTE(review): pool name "test" hardcoded; should presumably be $poolname.
271 ceph osd pool
set test size
2
272 ceph osd
unset norecover
273 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
276 wait_for_clean ||
return 1
# Re-resolve the primary after the mapping change.
279 primary
=$
(get_primary
$poolname obj1
)
281 local misplaced
=$
(expr $objects \
* 2)
282 local log
=$dir/osd.
${primary}.log
283 check
$dir $PG $primary replicated
0 0 $misplaced 0 ||
return 1
# Inline variant of the FIND_* pipelines: track the "shard $primary"
# (MISSING_ON_PRIMARY) counter from $objects down to 0.
285 UPACT
=$
(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log |
tail -1 |
sed "s/.*[)] \([[][^ p]*\).*$/\1/")
287 # This is the value of set into MISSING_ON_PRIMARY
288 FIRST
=$
(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log |
grep -F " $UPACT " |
head -1 |
sed "s/.* \([0-9]*\)$/\1/")
289 below_margin
$FIRST $objects ||
return 1
290 LAST
=$
(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log |
tail -1 |
sed "s/.* \([0-9]*\)$/\1/")
291 above_margin
$LAST 0 ||
return 1
293 delete_pool
$poolname
294 kill_daemons
$dir ||
return 1
298 # degraded 300 -> 200
299 # active+recovering+undersized+degraded
301 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
302 # 1.0 100 0 300 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563
# Grow a size-1 pool to size 4 while all non-primary OSDs are out, so the PG
# stays undersized; verify the degraded counter during that recovery.
# NOTE(review): extract is missing lines (`local dir=$1`, `local osds=...`,
# loop `do`/`done`, the `else` branch marking non-primaries out, the retry/
# timeout loop around the pg-dump poll, sleeps, closing brace).
303 function TEST_recovery_undersized
() {
307 run_mon
$dir a ||
return 1
308 run_mgr
$dir x ||
return 1
309 for i
in $
(seq 0 $
(expr $osds - 1))
311 run_osd
$dir $i ||
return 1
314 create_pool
$poolname 1 1
315 ceph osd pool
set $poolname size
1
317 wait_for_clean ||
return 1
319 for i
in $
(seq 1 $objects)
321 rados
-p $poolname put obj
$i /dev
/null
324 local primary
=$
(get_primary
$poolname obj1
)
325 local PG
=$
(get_pg
$poolname obj1
)
327 ceph osd
set norecover
328 # Mark any osd not the primary (only 1 replica so also has no replica)
329 for i
in $
(seq 0 $
(expr $osds - 1))
331 if [ $i = $primary ];
# NOTE(review): pool name "test" hardcoded; should presumably be $poolname.
338 ceph osd pool
set test size
4
339 ceph osd
unset norecover
340 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
341 # Give extra sleep time because code below doesn't have the sophistication of wait_for_clean()
343 flush_pg_stats ||
return 1
345 # Wait for recovery to finish
346 # Can't use wait_for_clean() because state goes from active+recovering+undersized+degraded
347 # to active+undersized+degraded
# Poll until the PG's state line no longer contains "recovering".
350 if ceph pg dump pgs |
grep ^
$PG |
grep -qv recovering
356 echo "Timeout waiting for recovery to finish"
# Re-resolve the primary after the size change.
363 primary
=$
(get_primary
$poolname obj1
)
364 local log
=$dir/osd.
${primary}.log
# degraded starts at 3 missing copies per object and ends at 2 (the PG
# remains undersized: only the primary holds data).
366 local first_degraded
=$
(expr $objects \
* 3)
367 local last_degraded
=$
(expr $objects \
* 2)
368 check
$dir $PG $primary replicated
$first_degraded $last_degraded 0 0 ||
return 1
370 delete_pool
$poolname
371 kill_daemons
$dir ||
return 1
374 # [1,0,2] -> [1,3,NONE]/[1,3,2]
376 # misplaced 100 -> 100
377 # active+recovering+degraded+remapped
379 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
380 # 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242
# Erasure-coded PG that ends up remapped (a NONE slot in the up set):
# verify the degraded/misplaced counters during recovery in that state.
# NOTE(review): extract is missing lines (`local dir=$1`, the for-loop over
# osds whose `if` guards are visible at original lines 414/418, loop/if
# closers, sleeps, closing brace).
381 function TEST_recovery_erasure_remapped
() {
384 run_mon
$dir a ||
return 1
385 run_mgr
$dir x ||
return 1
386 run_osd
$dir 0 ||
return 1
387 run_osd
$dir 1 ||
return 1
388 run_osd
$dir 2 ||
return 1
389 run_osd
$dir 3 ||
return 1
391 ceph osd erasure-code-profile
set myprofile plugin
=jerasure technique
=reed_sol_van k
=2 m
=1 crush-failure-domain
=osd
392 create_pool
$poolname 1 1 erasure myprofile
# min_size 2 keeps the PG active with one shard missing (k=2, m=1).
393 ceph osd pool
set $poolname min_size
2
395 wait_for_clean ||
return 1
397 for i
in $
(seq 1 $objects)
399 rados
-p $poolname put obj
$i /dev
/null
402 local primary
=$
(get_primary
$poolname obj1
)
403 local PG
=$
(get_pg
$poolname obj1
)
404 local otherosd
=$
(get_not_primary
$poolname obj1
)
406 ceph osd
set norecover
407 kill $
(cat $dir/osd.
${otherosd}.pid
)
408 ceph osd down osd.
${otherosd}
409 ceph osd out osd.
${otherosd}
411 # Mark osd not the primary and not down/out osd as just out
414 if [ $i = $primary ];
418 if [ $i = $otherosd ];
425 ceph osd
unset norecover
426 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
429 wait_for_clean ||
return 1
431 local log
=$dir/osd.
${primary}.log
# degraded $objects -> 0; misplaced stays at $objects (still remapped).
432 check
$dir $PG $primary erasure
$objects 0 $objects $objects ||
return 1
434 delete_pool
$poolname
435 kill_daemons
$dir ||
return 1
# Entry point: main() (from ceph-helpers.sh) creates the test dir and calls
# run() above with any remaining arguments.
# NOTE(review): this call appears BEFORE TEST_recovery_multi is defined, so
# run()'s `set`-based TEST_* discovery cannot see that test and it will never
# execute.  Upstream keeps `main ... "$@"` as the last statement of the file;
# this invocation should be moved below TEST_recovery_multi.
438 main osd-recovery-stats
"$@"
# Multi-phase recovery: write half the objects, fail the non-primary, write
# while the primary alone is up, swap which OSD is down, write the rest, then
# out both and grow the pool -- checks mixed degraded/misplaced/missing
# counters at the end.
# NOTE(review): defined AFTER the `main` call above, so run() never discovers
# this test (see note there).  Extract is also missing lines (`local dir=$1`,
# `local osds=...`, loop `do`/`done`, sleeps, closing brace).
440 function TEST_recovery_multi
() {
444 run_mon
$dir a ||
return 1
445 run_mgr
$dir x ||
return 1
446 for i
in $
(seq 0 $
(expr $osds - 1))
448 run_osd
$dir $i ||
return 1
451 create_pool
$poolname 1 1
452 ceph osd pool
set $poolname size
3
# min_size 1 lets the PG stay active while only the primary is up.
453 ceph osd pool
set $poolname min_size
1
455 wait_for_clean ||
return 1
457 rados
-p $poolname put obj1
/dev
/null
459 local primary
=$
(get_primary
$poolname obj1
)
460 local otherosd
=$
(get_not_primary
$poolname obj1
)
463 ceph osd
set norecover
464 kill $
(cat $dir/osd.
${otherosd}.pid
)
465 ceph osd down osd.
${otherosd}
# Phase 1: write the first half while the non-primary is down.
467 local half
=$
(expr $objects / 2)
468 for i
in $
(seq 2 $half)
470 rados
-p $poolname put obj
$i /dev
/null
# Phase 2: swap -- kill the primary, restart the other OSD, write the rest.
473 kill $
(cat $dir/osd.
${primary}.pid
)
474 ceph osd down osd.
${primary}
475 run_osd
$dir ${otherosd}
478 for i
in $
(seq $
(expr $half + 1) $objects)
480 rados
-p $poolname put obj
$i /dev
/null
483 local PG
=$
(get_pg
$poolname obj1
)
# Note: shadows the earlier otherosd, now relative to the LAST object.
484 local otherosd
=$
(get_not_primary
$poolname obj
$objects)
487 ceph osd out osd.
$primary osd.
$otherosd
488 run_osd
$dir ${primary}
# NOTE(review): pool name "test" hardcoded; should presumably be $poolname.
491 ceph osd pool
set test size
4
492 ceph osd
unset norecover
493 ceph tell osd.$
(get_primary
$poolname obj1
) debug kick_recovery_wq
0
496 wait_for_clean ||
return 1
# Re-resolve the primary after the out/size changes.
499 primary
=$
(get_primary
$poolname obj1
)
501 local log
=$dir/osd.
${primary}.log
502 check
$dir $PG $primary replicated
399 0 300 0 99 0 ||
return 1
504 delete_pool
$poolname
505 kill_daemons
$dir ||
return 1
510 # compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"