#!/usr/bin/env bash
#
# ceph/qa/standalone/osd/osd-recovery-stats.sh
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

# Test-harness entry point (invoked through main() at the bottom of the
# file): configure CEPH_ARGS and the knobs shared by all tests, then run
# each TEST_* function (or only those named on the command line) in a
# freshly set-up cluster directory.
#   $1   - test directory
#   $2.. - optional list of TEST_* function names to run
function run() {
    local dir=$1
    shift

    # Fix port????
    export CEPH_MON="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    # so we will not force auth_log_shard to be acting_primary
    CEPH_ARGS+="--osd_force_auth_primary_missing_objects=1000000 "
    # Use "high_recovery_ops" profile if mclock_scheduler is enabled.
    CEPH_ARGS+="--osd-mclock-profile=high_recovery_ops "

    # Globals read by the TEST_* functions and the margin helpers below.
    export margin=10     # allowed slack when comparing logged stat counters
    export objects=200   # objects written per test
    export poolname=test

    # Default: every TEST_* function currently defined in this shell.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}
44
# True (status 0) when $1 is at most $2 and no more than $margin below it.
function below_margin() {
    local -i actual=$1
    shift
    local -i expected=$1

    if (( actual <= expected && actual >= expected - margin )); then
        return 0
    fi
    return 1
}
52
# True (status 0) when $1 is at least $2 and no more than $margin above it.
function above_margin() {
    local -i actual=$1
    shift
    local -i expected=$1

    if (( actual >= expected && actual <= expected + margin )); then
        return 0
    fi
    return 1
}
60
9f95a23c
TL
61FIND_UPACT='grep "pg[[]${PG}.*recovering.*update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/"'
62FIND_FIRST='grep "pg[[]${PG}.*recovering.*update_calc_stats $which " $log | grep -F " ${UPACT}${addp}" | grep -v est | head -1 | sed "s/.* \([0-9]*\)$/\1/"'
63FIND_LAST='grep "pg[[]${PG}.*recovering.*update_calc_stats $which " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/"'
91327a77 64
# Verify the recovery stat counters the primary OSD logged for one PG.
# Scrapes "update_calc_stats" debug lines from the primary's log and, for
# each counter, checks that the first value logged (after the final
# up/acting set appeared) is within $margin of the expected start value,
# and the last value logged is within $margin of the expected end value.
#   $1     - test directory
#   $2     - PG id
#   $3     - primary osd id (its log file is scraped)
#   $4     - pool type: "replicated" or "erasure"
#   $5/$6  - expected degraded count at recovery start/end
#   $7/$8  - expected misplaced count at recovery start/end
#   $9/$10 - optional expected missing-on-primary count at start/end
# Returns 0 when every checked counter is within margin, 1 otherwise.
function check() {
    local dir=$1
    local PG=$2
    local primary=$3
    local type=$4
    local degraded_start=$5
    local degraded_end=$6
    local misplaced_start=$7
    local misplaced_end=$8
    local primary_start=${9:-}
    local primary_end=${10:-}

    local log=$dir/osd.${primary}.log

    # NOTE(review): for erasure pools the logged acting vector appears to
    # carry a trailing "p"; addp makes FIND_FIRST match that form — confirm
    # against the OSD log format.
    local addp=" "
    if [ "$type" = "erasure" ];
    then
        addp="p"
    fi

    # Final up/acting vector; FIND_FIRST anchors on it so that lines logged
    # before the mapping settled are ignored.
    UPACT=$(eval $FIND_UPACT)

    # Check 3rd line at start because of false recovery starts
    local which="degraded"
    FIRST=$(eval $FIND_FIRST)
    below_margin $FIRST $degraded_start || return 1
    LAST=$(eval $FIND_LAST)
    above_margin $LAST $degraded_end || return 1

    # Check 3rd line at start because of false recovery starts
    which="misplaced"
    FIRST=$(eval $FIND_FIRST)
    below_margin $FIRST $misplaced_start || return 1
    LAST=$(eval $FIND_LAST)
    above_margin $LAST $misplaced_end || return 1

    # This is the value of set into MISSING_ON_PRIMARY
    if [ -n "$primary_start" ];
    then
        which="shard $primary"
        FIRST=$(eval $FIND_FIRST)
        below_margin $FIRST $primary_start || return 1
        LAST=$(eval $FIND_LAST)
        above_margin $LAST $primary_end || return 1
    fi
}

# [1,0,?] -> [1,2,4]
# degraded 500 -> 0
# active+recovering+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467
# Shared body of the *_out1 tests: bring up a 6-OSD cluster, write
# $objects objects, then kill/down/out one non-primary OSD and let
# recovery run.  Afterwards check() expects the "degraded" counter to
# have gone from $objects down to 0 on the primary.
#   $1 - test directory
#   $2 - pool type: "replicated" or "erasure"
function do_recovery_out1() {
    local dir=$1
    shift
    local type=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    if [ $type = "erasure" ];
    then
        ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
        create_pool $poolname 1 1 $type myprofile
    else
        create_pool $poolname 1 1 $type
    fi

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Only 2 OSDs so only 1 not primary
    local otherosd=$(get_not_primary $poolname obj1)

    # Hold recovery back until the OSD is fully down/out, then kick it.
    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # degraded: $objects -> 0, misplaced: 0 -> 0
    check $dir $PG $primary $type $objects 0 0 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
167
# Replicated-pool variant of the out1 recovery scenario.
function TEST_recovery_replicated_out1() {
    do_recovery_out1 $1 replicated || return 1
}
173
# Erasure-coded-pool variant of the out1 recovery scenario.
function TEST_recovery_erasure_out1() {
    do_recovery_out1 $1 erasure || return 1
}
179
# [0, 1] -> [2,3,4,5]
# degraded 1000 -> 0
# misplaced 1000 -> 0
# missing on primary 500 -> 0

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 500 1000 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748
# Grow a size-2 pool to size 4 after marking both original OSDs out, so
# every object must be recovered onto 4 new OSDs.  Expected counters:
# degraded 2*$objects -> 0, misplaced 2*$objects -> 0, missing on
# primary $objects -> 0.
#   $1 - test directory
function TEST_recovery_sizeup() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Only 2 OSDs so only 1 not primary
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    ceph osd out osd.$primary osd.$otherosd
    # was: "ceph osd pool set test size 4" — use $poolname consistently
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    # Get new primary
    primary=$(get_primary $poolname obj1)

    ceph tell osd.${primary} debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Both original copies are gone, so every one of the 2 final extra
    # replicas starts out degraded and misplaced.
    local degraded=$((objects * 2))
    local misplaced=$((objects * 2))
    check $dir $PG $primary replicated $degraded 0 $misplaced 0 $objects 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
234
# [0, 1, 2, 4] -> [3, 5]
# misplaced 1000 -> 0
# missing on primary 500 -> 0
# active+recovering+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 500 0 1000 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248
# Shrink a size-4 pool to size 2 after marking all four acting OSDs out,
# so the data moves to 2 fresh OSDs.  Expected counters: misplaced
# 2*$objects -> 0; missing-on-primary is checked directly against the
# log since check() is called without the primary columns.
#   $1 - test directory
function TEST_recovery_sizedown() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 4

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # All four acting OSDs get marked out below
    local allosds=$(get_osds $poolname obj1)

    ceph osd set norecover
    for osd in $allosds
    do
        ceph osd out osd.$osd
    done

    # was: "ceph osd pool set test size 2" — use $poolname consistently
    ceph osd pool set $poolname size 2
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    local misplaced=$((objects * 2))
    local log=$dir/osd.${primary}.log
    check $dir $PG $primary replicated 0 0 $misplaced 0 || return 1

    # Final up/acting vector, used to anchor the shard counter greps below.
    UPACT=$(grep "pg[[]${PG}.*recovering.*update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value of set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*update_calc_stats shard $primary " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $objects || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*update_calc_stats shard $primary " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
300
# [1] -> [1,2]
# degraded 300 -> 200
# active+recovering+undersized+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 100 0 300 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563
# Grow a size-1 pool to size 4 with only 3 OSDs (one of which is marked
# out), leaving the PG permanently undersized.  Expected counters:
# degraded 3*$objects -> 2*$objects (recovery stops while still
# undersized+degraded, so wait_for_clean() cannot be used).
#   $1 - test directory
function TEST_recovery_undersized() {
    local dir=$1

    local osds=3
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $((osds - 1)))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 1 --yes-i-really-mean-it

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)

    ceph osd set norecover
    # Mark any osd not the primary (only 1 replica so also has no replica)
    for i in $(seq 0 $((osds - 1)))
    do
        if [ $i = $primary ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    # was: "ceph osd pool set test size 4" — use $poolname consistently
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    # Give extra sleep time because code below doesn't have the sophistication of wait_for_clean()
    sleep 10
    flush_pg_stats || return 1

    # Wait for recovery to finish
    # Can't use wait_for_clean() because state goes from active+recovering+undersized+degraded
    # to active+undersized+degraded
    for i in $(seq 1 300)
    do
        if ceph pg dump pgs | grep ^$PG | grep -qv recovering
        then
            break
        fi
        if [ $i = "300" ];
        then
            echo "Timeout waiting for recovery to finish"
            return 1
        fi
        sleep 1
    done

    # Get new primary
    primary=$(get_primary $poolname obj1)

    # size 4 with 1 copy present: 3 missing copies per object at start,
    # 2 still missing once recovery to the second usable OSD finishes.
    local first_degraded=$((objects * 3))
    local last_degraded=$((objects * 2))
    check $dir $PG $primary replicated $first_degraded $last_degraded 0 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
377
# [1,0,2] -> [1,3,NONE]/[1,3,2]
# degraded 100 -> 0
# misplaced 100 -> 100
# active+recovering+degraded+remapped

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242
# Erasure-coded recovery with a remapped shard: kill/out one non-primary
# OSD and mark a second one out, so one shard recovers while another is
# only remapped.  Expected counters: degraded $objects -> 0, misplaced
# $objects -> $objects (stays misplaced).
#   $1 - test directory
function TEST_recovery_erasure_remapped() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1

    ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
    create_pool $poolname 1 1 erasure myprofile
    ceph osd pool set $poolname min_size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}

    # Mark osd not the primary and not down/out osd as just out
    for i in 0 1 2 3
    do
        if [ $i = $primary ];
        then
            continue
        fi
        if [ $i = $otherosd ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # (removed unused "local log=..." — check() derives the log path itself)
    check $dir $PG $primary erasure $objects 0 $objects $objects || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
441
# Multi-phase recovery: write half the objects with one replica down,
# swap which OSD is down, write the other half, then mark the old
# primary and another OSD out and grow the pool to size 4 — so degraded,
# misplaced and missing-on-primary are all non-zero at recovery start.
#   $1 - test directory
function TEST_recovery_multi() {
    local dir=$1

    local osds=6
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for i in $(seq 0 $((osds - 1)))
    do
        run_osd $dir $i || return 1
    done

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    ceph osd pool set $poolname min_size 1

    wait_for_clean || return 1

    rados -p $poolname put obj1 /dev/null

    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set noout
    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}

    # First half written while otherosd is down.
    local half=$((objects / 2))
    for i in $(seq 2 $half)
    do
        rados -p $poolname put obj$i /dev/null
    done

    # Swap: bring otherosd back, take the primary down, write second half.
    kill $(cat $dir/osd.${primary}.pid)
    ceph osd down osd.${primary}
    activate_osd $dir ${otherosd}
    sleep 3

    for i in $(seq $((half + 1)) $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj$objects)

    ceph osd unset noout
    ceph osd out osd.$primary osd.$otherosd
    activate_osd $dir ${primary}
    sleep 3

    # was: "ceph osd pool set test size 4" — use $poolname consistently
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get new primary
    primary=$(get_primary $poolname obj1)

    # NOTE(review): 399/300/99 appear derived from objects=200 / half=100;
    # they would need rederiving if $objects changes.
    check $dir $PG $primary replicated 399 0 300 0 99 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
509
11fdf7f2 510main osd-recovery-stats "$@"
91327a77 511
b32b8144
FG
512# Local Variables:
513# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"
514# End: