]> git.proxmox.com Git - ceph.git/blob - ceph/qa/standalone/osd/osd-recovery-stats.sh
update sources to 12.2.10
[ceph.git] / ceph / qa / standalone / osd / osd-recovery-stats.sh
1 #!/usr/bin/env bash
2 #
3 # Copyright (C) 2017 Red Hat <contact@redhat.com>
4 #
5 # Author: David Zafman <dzafman@redhat.com>
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
16 #
17
18 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
19
# Test driver.
#
# Exports the settings shared by every test in this file (monitor address,
# CEPH_ARGS, the stats margin, the object count and the pool name) and then
# runs each selected test function between a setup and a teardown of the
# test directory.
#
# Arguments:
#   $1   - test directory, handed to setup, the test function and teardown
#   $2.. - optional names of the functions to run; when none are given,
#          every function currently defined whose name starts with TEST_
# Returns: 1 as soon as setup, a test function or teardown fails.
function run() {
    local dir=$1
    shift

    # Fix port????
    # git grep '\<7115\>' : there must be only one
    CEPH_MON="127.0.0.1:7115"
    export CEPH_MON
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none --mon-host=$CEPH_MON "
    export CEPH_ARGS

    # Shared by the tests below: tolerance used by below_margin/above_margin,
    # number of objects written per test, and the pool the tests work on.
    margin=10
    objects=200
    poolname=test
    export margin objects poolname

    # "set" lists function definitions as "NAME () ...", which is what the
    # sed expression picks the TEST_* names out of.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        if ! setup $dir ; then
            return 1
        fi
        if ! $func $dir ; then
            return 1
        fi
        if ! teardown $dir ; then
            return 1
        fi
    done
}
40
# Succeeds when an observed value is at, or at most a tolerance below, the
# expected value, i.e. target - tolerance <= check <= target.
#
# Arguments:
#   $1 - observed value (an empty value is treated as 0)
#   $2 - expected value (an empty value is treated as 0)
#   $3 - optional tolerance; defaults to the global $margin, or to 0 when
#        that is not set either
# Returns: 0 when check is inside the range, 1 otherwise.
function below_margin() {
    local -i check=$1
    local -i target=$2
    # Fall back to 0 so an unset $margin no longer yields an arithmetic
    # syntax error.
    local -i slack=${3:-${margin:-0}}

    (( check <= target && check >= target - slack ))
}
48
# Succeeds when an observed value is at, or at most a tolerance above, the
# expected value, i.e. target <= check <= target + tolerance.
#
# Arguments:
#   $1 - observed value (an empty value is treated as 0)
#   $2 - expected value (an empty value is treated as 0)
#   $3 - optional tolerance; defaults to the global $margin, or to 0 when
#        that is not set either
# Returns: 0 when check is inside the range, 1 otherwise.
function above_margin() {
    local -i check=$1
    local -i target=$2
    # Fall back to 0 so an unset $margin no longer yields an arithmetic
    # syntax error.
    local -i slack=${3:-${margin:-0}}

    (( check >= target && check <= target + slack ))
}
56
# Log-scraping pipelines used by check().  They are kept as single-quoted
# strings and run through eval, so $PG, $log, $which, $UPACT and $addp are
# expanded at eval time from the scope of the function doing the eval.
#
# FIND_UPACT - from the last "recovering ... _update_calc_stats" line of the
#              PG, print the bracketed set that follows ") " (the match stops
#              at a space or a 'p').
# FIND_FIRST - trailing number of the first "_update_calc_stats $which" line
#              that contains " ${UPACT}${addp}" and does not contain "est".
# FIND_LAST  - trailing number of the last "_update_calc_stats $which" line.
FIND_UPACT='grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/"'
FIND_FIRST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | grep -F " ${UPACT}${addp}" | grep -v est | head -1 | sed "s/.* \([0-9]*\)$/\1/"'
FIND_LAST='grep "pg[[]${PG}.*recovering.*_update_calc_stats $which " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/"'

# Verify the recovery statistics that an OSD logged for one PG.
#
# For each statistic, the first logged value (taken from lines carrying the
# set found by FIND_UPACT) must be within $margin at or below the expected
# start, and the last logged value within $margin at or above the expected
# end.
#
# Arguments:
#   $1  - test directory; the log read is $1/osd.$3.log
#   $2  - PG id
#   $3  - id of the OSD whose log is inspected
#   $4  - pool type; "erasure" makes the set match on a trailing 'p'
#         instead of a trailing space
#   $5  - expected first "degraded" value
#   $6  - expected last "degraded" value
#   $7  - expected first "misplaced" value
#   $8  - expected last "misplaced" value
#   $9  - optional expected first "shard $3" value
#   $10 - optional expected last "shard $3" value
# Returns: 1 on the first value that is out of range, 0 otherwise.
function check() {
    local dir=$1
    local PG=$2
    local primary=$3
    local type=$4
    local degraded_start=$5
    local degraded_end=$6
    local misplaced_start=$7
    local misplaced_end=$8
    local primary_start=${9:-}
    local primary_end=${10:-}

    local log=$dir/osd.${primary}.log

    local addp=" "
    if [ "$type" = "erasure" ];
    then
        addp="p"
    fi

    # Scratch values stay local so they no longer leak into the caller.
    local UPACT FIRST LAST

    # The pipelines are quoted for eval so their text is neither glob-expanded
    # nor re-split before being evaluated.
    UPACT=$(eval "$FIND_UPACT")

    # Start from the first line that already shows the final set (UPACT), so
    # that earlier, false recovery starts are skipped.
    local which="degraded"
    FIRST=$(eval "$FIND_FIRST")
    below_margin $FIRST $degraded_start || return 1
    LAST=$(eval "$FIND_LAST")
    above_margin $LAST $degraded_end || return 1

    # Same selection for the misplaced count.
    which="misplaced"
    FIRST=$(eval "$FIND_FIRST")
    below_margin $FIRST $misplaced_start || return 1
    LAST=$(eval "$FIND_LAST")
    above_margin $LAST $misplaced_end || return 1

    # This is the value that is set into MISSING_ON_PRIMARY
    if [ -n "$primary_start" ];
    then
        which="shard $primary"
        FIRST=$(eval "$FIND_FIRST")
        below_margin $FIRST $primary_start || return 1
        LAST=$(eval "$FIND_LAST")
        above_margin $LAST $primary_end || return 1
    fi
}
107
108 # [1,0,?] -> [1,2,4]
109 # degraded 500 -> 0
110 # active+recovering+degraded
111
112 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
113 # 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467
# Shared body of the "out1" recovery tests.
#
# Starts a mon, a mgr and six OSDs, creates a one-PG pool of the requested
# type and writes $objects empty objects.  With norecover set, one OSD
# returned by get_not_primary for obj1 is killed, marked down and marked out;
# norecover is then cleared and recovery is kicked.  Once the cluster is
# clean again, the original primary's log must show degraded going from
# $objects to 0 and misplaced staying at 0.
#
# Arguments:
#   $1 - test directory
#   $2 - pool type passed to create_pool; "erasure" additionally creates and
#        uses the erasure-code profile "myprofile" (k=2, m=1)
# Returns: 1 if a daemon fails to start, the cluster does not become clean,
#          the statistics check fails or the daemons cannot be stopped.
function do_recovery_out1() {
    local dir=$1 type=$2

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    local osdnum
    for osdnum in 0 1 2 3 4 5 ; do
        run_osd $dir $osdnum || return 1
    done

    if [ $type = "erasure" ];
    then
        ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
        create_pool $poolname 1 1 $type myprofile
    else
        create_pool $poolname 1 1 $type
    fi

    if ! wait_for_clean ; then
        return 1
    fi

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # The OSD to take away: whatever get_not_primary reports for obj1
    local otherosd=$(get_not_primary $poolname obj1)
    local pidfile=$dir/osd.${otherosd}.pid

    # Hold recovery back until the OSD is gone, then release and kick it
    ceph osd set norecover
    kill $(cat $pidfile)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}
    ceph osd unset norecover
    local kicked=$(get_primary $poolname obj1)
    ceph tell osd.$kicked debug kick_recovery_wq 0
    sleep 2

    if ! wait_for_clean ; then
        return 1
    fi

    if ! check $dir $PG $primary $type $objects 0 0 0 ; then
        return 1
    fi

    delete_pool $poolname
    kill_daemons $dir || return 1
}
163
# Runs the "out1" recovery scenario on a replicated pool.
# $1 is the test directory; returns 1 when the scenario fails.
function TEST_recovery_replicated_out1() {
    local testdir=$1

    if ! do_recovery_out1 $testdir replicated ; then
        return 1
    fi
}
169
# Runs the "out1" recovery scenario on an erasure-coded pool.
# $1 is the test directory; returns 1 when the scenario fails.
function TEST_recovery_erasure_out1() {
    local testdir=$1

    if ! do_recovery_out1 $testdir erasure ; then
        return 1
    fi
}
175
176 # [0, 1] -> [2,3,4,5]
177 # degraded 1000 -> 0
178 # misplaced 1000 -> 0
179 # missing on primary 500 -> 0
180
181 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
182 # 1.0 500 500 1000 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748
# Recovery statistics when a pool grows from size 2 to size 4.
#
# Starts a mon, a mgr and six OSDs, creates a one-PG pool with size 2 and
# writes $objects empty objects.  With norecover set, the primary and the
# OSD returned by get_not_primary for obj1 are marked out and the pool size
# is raised to 4; norecover is then cleared and recovery is kicked.  Once the
# cluster is clean, the new primary's log must show degraded and misplaced
# each going from 2 * $objects to 0 and its own shard count going from
# $objects to 0.
#
# Arguments: $1 - test directory
# Returns: 1 if a daemon fails to start, the cluster does not become clean,
#          the statistics check fails or the daemons cannot be stopped.
function TEST_recovery_sizeup() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 2

    wait_for_clean || return 1

    local i
    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # The pool has size 2, so besides the primary one more OSD holds obj1
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    ceph osd out osd.$primary osd.$otherosd
    # Use $poolname rather than a hard-coded pool name
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    local degraded=$(( objects * 2 ))
    local misplaced=$(( objects * 2 ))
    check $dir $PG $primary replicated $degraded 0 $misplaced 0 $objects 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
230
231 # [0, 1, 2, 4] -> [3, 5]
232 # misplaced 1000 -> 0
233 # missing on primary 500 -> 0
234 # active+recovering+degraded
235
236 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
237 # 1.0 500 500 0 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248
# Recovery statistics when a pool shrinks from size 4 to size 2.
#
# Starts a mon, a mgr and six OSDs, creates a one-PG pool with size 4 and
# writes $objects empty objects.  With norecover set, every OSD returned by
# get_osds for obj1 is marked out and the pool size is lowered to 2;
# norecover is then cleared and recovery is kicked.  Once the cluster is
# clean, the new primary's log must show degraded staying at 0, misplaced
# going from 2 * $objects to 0 and its own shard count going from $objects
# to 0.
#
# Arguments: $1 - test directory
# Returns: 1 if a daemon fails to start, the cluster does not become clean,
#          a statistics check fails or the daemons cannot be stopped.
function TEST_recovery_sizedown() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 4

    wait_for_clean || return 1

    local i
    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # All OSDs reported by get_osds for obj1; each one is marked out below
    local allosds=$(get_osds $poolname obj1)

    ceph osd set norecover
    local osd
    for osd in $allosds
    do
        ceph osd out osd.$osd
    done

    # Use $poolname rather than a hard-coded pool name
    ceph osd pool set $poolname size 2
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    local misplaced=$(( objects * 2 ))
    local log=$dir/osd.${primary}.log
    check $dir $PG $primary replicated 0 0 $misplaced 0 || return 1

    # Scratch values stay local to this test
    local UPACT FIRST LAST

    # Bracketed set printed on the last recovering _update_calc_stats line
    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value that is set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $objects || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats shard $primary " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
296
297 # [1] -> [1,2]
298 # degraded 300 -> 200
299 # active+recovering+undersized+degraded
300
301 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
302 # 1.0 100 0 300 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563
# Recovery statistics for a PG that stays undersized after recovery.
#
# Starts a mon, a mgr and three OSDs, creates a one-PG pool with size 1 and
# writes $objects empty objects.  With norecover set, the lowest-numbered
# OSD that is not the primary is marked out and the pool size is raised to
# 4; norecover is then cleared and recovery is kicked.  The test polls
# "ceph pg dump pgs" until the PG no longer reports "recovering" and then
# requires the primary's log to show degraded going from 3 * $objects to
# 2 * $objects and misplaced staying at 0.
#
# Arguments: $1 - test directory
# Returns: 1 if a daemon fails to start, the cluster does not become clean
#          before the change, flush_pg_stats fails, recovery is still
#          running after the polling loop, the statistics check fails or
#          the daemons cannot be stopped.
function TEST_recovery_undersized() {
    local dir=$1

    local osds=3
    local i
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $(( osds - 1 )))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 1

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)

    ceph osd set norecover
    # Mark any osd not the primary (only 1 replica so also has no replica)
    for i in $(seq 0 $(( osds - 1 )))
    do
        if [ $i = $primary ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    # Use $poolname rather than a hard-coded pool name
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    # Give extra sleep time because code below doesn't have the sophistication of wait_for_clean()
    sleep 10
    flush_pg_stats || return 1

    # Wait for recovery to finish
    # Can't use wait_for_clean() because state goes from active+recovering+undersized+degraded
    # to active+undersized+degraded
    for i in $(seq 1 60)
    do
        if ceph pg dump pgs | grep ^$PG | grep -qv recovering
        then
            break
        fi
        if [ $i = "60" ];
        then
            echo "Timeout waiting for recovery to finish"
            return 1
        fi
        sleep 1
    done

    # Get new primary
    primary=$(get_primary $poolname obj1)

    local first_degraded=$(( objects * 3 ))
    local last_degraded=$(( objects * 2 ))
    check $dir $PG $primary replicated $first_degraded $last_degraded 0 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
373
374 # [1,0,2] -> [1,3,NONE]/[1,3,2]
375 # degraded 100 -> 0
376 # misplaced 100 -> 100
377 # active+recovering+degraded+remapped
378
379 # PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
380 # 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242
# Recovery statistics for an erasure-coded PG that ends up remapped.
#
# Starts a mon, a mgr and four OSDs, creates a one-PG erasure-coded pool
# (profile "myprofile", k=2 m=1) with min_size 2 and writes $objects empty
# objects.  With norecover set, the OSD returned by get_not_primary for obj1
# is killed, marked down and marked out, and the lowest-numbered OSD that is
# neither that one nor the primary is marked out as well; norecover is then
# cleared and recovery is kicked.  Once wait_for_clean succeeds, the original
# primary's log must show degraded going from $objects to 0 and misplaced
# staying at $objects.
#
# Arguments: $1 - test directory
# Returns: 1 if a daemon fails to start, wait_for_clean fails, the
#          statistics check fails or the daemons cannot be stopped.
function TEST_recovery_erasure_remapped() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1

    ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
    create_pool $poolname 1 1 erasure myprofile
    ceph osd pool set $poolname min_size 2

    wait_for_clean || return 1

    local i
    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}

    # Mark osd not the primary and not down/out osd as just out
    for i in 0 1 2 3
    do
        if [ $i = $primary ];
        then
            continue
        fi
        if [ $i = $otherosd ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # The primary looked up before the change is the one whose log is checked
    check $dir $PG $primary erasure $objects 0 $objects $objects || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
437
# Recovery statistics after several overlapping changes.
#
# Starts a mon, a mgr and six OSDs, creates a one-PG pool with size 3 and
# min_size 1 and writes obj1.  With noout and norecover set:
#   - the OSD returned by get_not_primary for obj1 is killed and marked
#     down, then objects 2 .. $objects/2 are written;
#   - the primary is killed and marked down, the first OSD is restarted,
#     then the remaining objects up to $objects are written.
# noout is then cleared, the old primary and the OSD returned by
# get_not_primary for the last object are marked out, the old primary is
# restarted, the pool size is raised to 4, norecover is cleared and
# recovery is kicked.  Once the cluster is clean, the new primary's log must
# show degraded 399 -> 0, misplaced 300 -> 0 and its own shard count
# 99 -> 0.
# NOTE(review): the expected start values are literals and presumably
# assume objects=200 as exported by run() - confirm before changing it.
#
# Arguments: $1 - test directory
# Returns: 1 if a daemon fails to start in the initial bring-up, the cluster
#          does not become clean, the statistics check fails or the daemons
#          cannot be stopped.
function TEST_recovery_multi() {
    local dir=$1

    local osds=6
    local i
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $(( osds - 1 )))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    ceph osd pool set $poolname min_size 1

    wait_for_clean || return 1

    rados -p $poolname put obj1 /dev/null

    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set noout
    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}

    local half=$(( objects / 2 ))
    for i in $(seq 2 $half)
    do
        rados -p $poolname put obj$i /dev/null
    done

    kill $(cat $dir/osd.${primary}.pid)
    ceph osd down osd.${primary}
    run_osd $dir ${otherosd}
    sleep 3

    for i in $(seq $(( half + 1 )) $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local PG=$(get_pg $poolname obj1)
    # Re-query: this now refers to a non-primary OSD of the last object
    otherosd=$(get_not_primary $poolname obj$objects)

    ceph osd unset noout
    ceph osd out osd.$primary osd.$otherosd
    run_osd $dir ${primary}
    sleep 3

    # Use $poolname rather than a hard-coded pool name
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    check $dir $PG $primary replicated 399 0 300 0 99 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

# Must stay after the last TEST_* definition: run() only discovers (and the
# shell can only call) functions that are already defined when main runs.
main osd-recovery-stats "$@"
507
508
509 # Local Variables:
510 # compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"
511 # End: