]> git.proxmox.com Git - ceph.git/blame - ceph/qa/standalone/osd/osd-recovery-stats.sh
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / qa / standalone / osd / osd-recovery-stats.sh
CommitLineData
b32b8144
FG
1#!/usr/bin/env bash
2#
3# Copyright (C) 2017 Red Hat <contact@redhat.com>
4#
5# Author: David Zafman <dzafman@redhat.com>
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU Library Public License as published by
9# the Free Software Foundation; either version 2, or (at your option)
10# any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Library Public License for more details.
16#
17
18source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
19
# Entry point invoked by main(): set up cluster-wide environment, then run
# each requested TEST_* function (all of them by default) inside its own
# freshly created and torn-down test directory.
function run() {
    local dir=$1
    shift

    # Fix port????
    export CEPH_MON="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    # so we will not force auth_log_shard to be acting_primary
    CEPH_ARGS+="--osd_force_auth_primary_missing_objects=1000000 "
    # Globals shared by every test: allowed stat slack, object count, pool name.
    export margin=10
    export objects=200
    export poolname=test

    # Default to every TEST_* function currently defined in this file.
    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for testfunc in $selected ; do
        setup $dir || return 1
        $testfunc $dir || return 1
        teardown $dir || return 1
    done
}
42
# Succeed (return 0) when $1 is at most $2 but no more than the global
# $margin below it, i.e. target - margin <= value <= target.
function below_margin() {
    local -i value=$1
    local -i target=$2

    if (( value <= target && value >= target - margin )); then
        return 0
    fi
    return 1
}
50
# Succeed (return 0) when $1 is at least $2 but no more than the global
# $margin above it, i.e. target <= value <= target + margin.
function above_margin() {
    local -i value=$1
    local -i target=$2

    if (( value >= target && value <= target + margin )); then
        return 0
    fi
    return 1
}
58
91327a77
AA
59FIND_UPACT='grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/"'
60FIND_FIRST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | grep -F " ${UPACT}${addp}" | grep -v est | head -1 | sed "s/.* \([0-9]*\)$/\1/"'
61FIND_LAST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/"'
62
# Verify recovery-stat progression for a PG by parsing the primary OSD's log.
# For each counter (degraded, misplaced, and optionally "shard <primary>" /
# MISSING_ON_PRIMARY) assert that the first logged value is within $margin
# below its expected start and the last logged value is within $margin above
# its expected end.
# Arguments:
#   $1 dir, $2 PG id, $3 primary osd id, $4 pool type (replicated|erasure),
#   $5/$6 degraded start/end, $7/$8 misplaced start/end,
#   $9/$10 (optional) missing-on-primary start/end.
# Returns 0 on success, 1 on any out-of-margin value.
function check() {
    local dir=$1
    local PG=$2
    local primary=$3
    local type=$4
    local degraded_start=$5
    local degraded_end=$6
    local misplaced_start=$7
    local misplaced_end=$8
    local primary_start=${9:-}
    local primary_end=${10:-}

    local log=$dir/osd.${primary}.log

    # Erasure-coded pools log the acting set with a "p" suffix marker;
    # replicated pools use a plain space.
    local addp=" "
    if [ "$type" = "erasure" ];
    then
      addp="p"
    fi

    # Most recent up/acting vector; used to anchor FIND_FIRST to the final
    # recovery interval (ignoring earlier false starts).
    UPACT=$(eval $FIND_UPACT)

    # Check 3rd line at start because of false recovery starts
    local which="degraded"
    FIRST=$(eval $FIND_FIRST)
    below_margin $FIRST $degraded_start || return 1
    LAST=$(eval $FIND_LAST)
    above_margin $LAST $degraded_end || return 1

    # Check 3rd line at start because of false recovery starts
    which="misplaced"
    FIRST=$(eval $FIND_FIRST)
    below_margin $FIRST $misplaced_start || return 1
    LAST=$(eval $FIND_LAST)
    above_margin $LAST $misplaced_end || return 1

    # This is the value of set into MISSING_ON_PRIMARY
    if [ -n "$primary_start" ];
    then
      which="shard $primary"
      FIRST=$(eval $FIND_FIRST)
      below_margin $FIRST $primary_start || return 1
      LAST=$(eval $FIND_LAST)
      above_margin $LAST $primary_end || return 1
    fi
}
109
110# [1,0,?] -> [1,2,4]
111# degraded 500 -> 0
112# active+recovering+degraded
113
114# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
115# 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467
# Shared scenario for the *_out1 tests: bring up a 6-OSD cluster, create a
# pool of the given type, write $objects objects, then kill+down+out one
# non-primary OSD and verify via check() that the degraded count starts at
# $objects and recovers to 0 (misplaced stays 0).
# Arguments: $1 dir, $2 pool type (replicated|erasure).
function do_recovery_out1() {
    local dir=$1
    shift
    local type=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    if [ $type = "erasure" ];
    then
      ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
      create_pool $poolname 1 1 $type myprofile
    else
      create_pool $poolname 1 1 $type
    fi

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Only 2 OSDs so only 1 not primary
    local otherosd=$(get_not_primary $poolname obj1)

    # Hold recovery off while the OSD is removed so stats start from a
    # well-defined state, then kick the recovery work queue explicitly.
    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # degraded $objects -> 0, misplaced 0 -> 0
    check $dir $PG $primary $type $objects 0 0 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
165
# Out-one-OSD recovery-stats scenario on a replicated pool.
function TEST_recovery_replicated_out1() {
    local testdir=$1

    do_recovery_out1 $testdir replicated || return 1
}
171
# Out-one-OSD recovery-stats scenario on an erasure-coded pool.
function TEST_recovery_erasure_out1() {
    local testdir=$1

    do_recovery_out1 $testdir erasure || return 1
}
177
178# [0, 1] -> [2,3,4,5]
91327a77
AA
179# degraded 1000 -> 0
180# misplaced 1000 -> 0
b32b8144
FG
181# missing on primary 500 -> 0
182
183# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
91327a77 184# 1.0 500 500 1000 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748
b32b8144
FG
# Grow a replicated pool from size 2 to size 4 while marking both original
# OSDs out, and verify degraded/misplaced/missing-on-primary counters all
# drain to zero (see the PG_STAT example comment above).
function TEST_recovery_sizeup() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Only 2 OSDs so only 1 not primary
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    ceph osd out osd.$primary osd.$otherosd
    # Fixed: was hard-coded "test"; use $poolname like the rest of the script.
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    # Get new primary
    primary=$(get_primary $poolname obj1)

    ceph tell osd.${primary} debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # degraded 2*objects -> 0, misplaced 2*objects -> 0,
    # missing on primary objects -> 0 (per the header comment).
    local degraded=$(expr $objects \* 2)
    local misplaced=$(expr $objects \* 2)
    check $dir $PG $primary replicated $degraded 0 $misplaced 0 $objects 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
232
233# [0, 1, 2, 4] -> [3, 5]
91327a77 234# misplaced 1000 -> 0
b32b8144
FG
235# missing on primary 500 -> 0
236# active+recovering+degraded
237
238# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
91327a77 239# 1.0 500 500 0 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248
b32b8144
FG
# Shrink a replicated pool from size 4 to size 2 while marking all current
# OSDs out, and verify the misplaced counter drains to zero and that the
# MISSING_ON_PRIMARY ("shard <primary>") counter goes objects -> 0.
function TEST_recovery_sizedown() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 4

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # All OSDs currently in the acting set (stale "Only 2 OSDs" comment fixed)
    local allosds=$(get_osds $poolname obj1)

    ceph osd set norecover
    for osd in $allosds
    do
        ceph osd out osd.$osd
    done

    # Fixed: was hard-coded "test"; use $poolname like the rest of the script.
    ceph osd pool set $poolname size 2
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    # misplaced 2*objects -> 0 (per the header comment)
    local misplaced=$(expr $objects \* 2)
    local log=$dir/osd.${primary}.log
    check $dir $PG $primary replicated 0 0 $misplaced 0 || return 1

    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value of set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $objects || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
298
299# [1] -> [1,2]
91327a77 300# degraded 300 -> 200
b32b8144
FG
301# active+recovering+undersized+degraded
302
303# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
91327a77 304# 1.0 100 0 300 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563
b32b8144
FG
# Grow a size-1 pool to size 4 on a 3-OSD cluster (with one non-primary OSD
# marked out), leaving the PG undersized; verify degraded goes from
# 3*objects down to 2*objects per the header comment.
function TEST_recovery_undersized() {
    local dir=$1

    local osds=3
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $(expr $osds - 1))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 1

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)

    ceph osd set norecover
    # Mark any osd not the primary (only 1 replica so also has no replica)
    for i in $(seq 0 $(expr $osds - 1))
    do
        if [ $i = $primary ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    # Fixed: was hard-coded "test"; use $poolname like the rest of the script.
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    # Give extra sleep time because code below doesn't have the sophistication of wait_for_clean()
    sleep 10
    flush_pg_stats || return 1

    # Wait for recovery to finish
    # Can't use wait_for_clean() because state goes from active+recovering+undersized+degraded
    # to active+undersized+degraded
    for i in $(seq 1 60)
    do
        if ceph pg dump pgs | grep ^$PG | grep -qv recovering
        then
            break
        fi
        if [ $i = "60" ];
        then
            echo "Timeout waiting for recovery to finish"
            return 1
        fi
        sleep 1
    done

    # Get new primary
    primary=$(get_primary $poolname obj1)

    # degraded 3*objects -> 2*objects (pool stays undersized: only 2 of the
    # 3 OSDs remain in, so one target copy can never be created)
    local first_degraded=$(expr $objects \* 3)
    local last_degraded=$(expr $objects \* 2)
    check $dir $PG $primary replicated $first_degraded $last_degraded 0 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
375
376# [1,0,2] -> [1,3,NONE]/[1,3,2]
377# degraded 100 -> 0
378# misplaced 100 -> 100
379# active+recovering+degraded+remapped
380
381# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
382# 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242
# Erasure-coded pool where losing one OSD and marking another out leaves the
# PG remapped ([1,3,NONE]/[1,3,2] per the header comment): degraded drains
# objects -> 0 while misplaced stays at objects throughout recovery.
function TEST_recovery_erasure_remapped() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1

    ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
    create_pool $poolname 1 1 erasure myprofile
    ceph osd pool set $poolname min_size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}

    # Mark osd not the primary and not down/out osd as just out
    for i in 0 1 2 3
    do
        if [ $i = $primary ];
        then
            continue
        fi
        if [ $i = $otherosd ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # degraded objects -> 0, misplaced objects -> objects
    # (unused "local log" removed; check() derives the log path itself)
    check $dir $PG $primary erasure $objects 0 $objects $objects || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
439
91327a77
AA
# Multi-phase scenario: write half the objects with one replica down, flip
# which OSD is down, write the rest, then out two OSDs and grow the pool to
# size 4; verify degraded 399->0, misplaced 300->0 and missing-on-primary
# 99->0 as recovery completes.
function TEST_recovery_multi() {
    local dir=$1

    local osds=6
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $(expr $osds - 1))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    ceph osd pool set $poolname min_size 1

    wait_for_clean || return 1

    rados -p $poolname put obj1 /dev/null

    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    # Phase 1: write the first half with one replica OSD down.
    ceph osd set noout
    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}

    local half=$(expr $objects / 2)
    for i in $(seq 2 $half)
    do
        rados -p $poolname put obj$i /dev/null
    done

    # Phase 2: swap which OSD is down and write the second half.
    kill $(cat $dir/osd.${primary}.pid)
    ceph osd down osd.${primary}
    run_osd $dir ${otherosd}
    sleep 3

    for i in $(seq $(expr $half + 1) $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj$objects)

    # Phase 3: out two OSDs, bring the old primary back, grow the pool.
    ceph osd unset noout
    ceph osd out osd.$primary osd.$otherosd
    run_osd $dir ${primary}
    sleep 3

    # Fixed: was hard-coded "test"; use $poolname like the rest of the script.
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    check $dir $PG $primary replicated 399 0 300 0 99 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
507
11fdf7f2 508main osd-recovery-stats "$@"
91327a77 509
b32b8144
FG
510# Local Variables:
511# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"
512# End: