#!/usr/bin/env bash
#
# Copyright (C) 2017 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

function run() {
    local dir=$1
    shift

    # Fix port????
    export CEPH_MON="127.0.0.1:7115" # git grep '\<7115\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    export margin=10
    export objects=200
    export poolname=test

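    # Run every function in this file whose name starts with TEST_,
    # or only the test names passed on the command line.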
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}

function below_margin() {
    local -i check=$1
    shift
    local -i target=$1

    return $(( $check <= $target && $check >= $target - $margin ? 0 : 1 ))
}

function above_margin() {
    local -i check=$1
    shift
    local -i target=$1

    return $(( $check >= $target && $check <= $target + $margin ? 0 : 1 ))
}
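
# A worked example of the margin helpers (illustrative only; margin=10 is
# set in run()):
#   below_margin 195 200  -> 0 (success): 195 is in [target - margin, target] = [190, 200]
#   below_margin 185 200  -> 1 (failure): more than margin below the target
#   above_margin 205 200  -> 0 (success): 205 is in [target, target + margin] = [200, 210]
#   above_margin 215 200  -> 1 (failure): more than margin above the target
# The tolerance lets the sampled first/last stat values drift by a few
# objects from run to run without failing the test.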

function check() {
    local PG=$1
    local log=$2
    local degraded_start=$3
    local degraded_end=$4
    local misplaced_start=$5
    local misplaced_end=$6
    local type=$7

    local addp=" "
    if [ "$type" = "erasure" ];
    then
        addp="p"
    fi

    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # Check the first line that matches the final up/acting set, because
    # earlier lines can come from false recovery starts
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " $log | grep -F " ${UPACT}${addp}" | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $degraded_start || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST $degraded_end || return 1

    # Same first/last extraction for the misplaced counts
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats misplaced " $log | grep -F " ${UPACT}${addp}" | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $misplaced_start || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats misplaced " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST $misplaced_end || return 1
}
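
# How check() reads the log (a sketch of the assumptions baked into the
# greps above): UPACT captures the bracketed up set (e.g. "[1,2,4]") that
# follows the ")" in the last _update_calc_stats line for the PG, and
# FIRST/LAST take the trailing integer (the degraded or misplaced count)
# from the first and last matching lines. Filtering FIRST on
# " ${UPACT}${addp}" discards counts logged before the final mapping was
# chosen (false recovery starts); the "p" suffix for erasure pools matches
# how those log lines appear to format the set, with a primary/shard
# marker instead of a trailing space.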

# [1,0,?] -> [1,2,4]
# degraded 500 -> 0
# active+recovering+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 0 500 0 0 0 500 500 active+recovering+degraded 2017-11-17 19:27:36.493828 28'500 32:603 [1,2,4] 1 [1,2,4] 1 0'0 2017-11-17 19:27:05.915467 0'0 2017-11-17 19:27:05.915467
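#
# Expected arithmetic (a sketch): stopping and out-ing one non-primary OSD
# loses one of the three replicas (or k=2/m=1 shards), so recovery starts
# with one missing copy per object (degraded == $objects) and drains to 0;
# nothing is misplaced because the surviving copies are already where they
# belong. The sample row above appears to come from a run with 500
# objects; run() now sets objects=200.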
function do_recovery_out1() {
    local dir=$1
    shift
    local type=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    if [ "$type" = "erasure" ];
    then
        ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
        create_pool $poolname 1 1 $type myprofile
    else
        create_pool $poolname 1 1 $type
    fi

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Pick one OSD in the PG that is not the primary
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    local log=$dir/osd.${primary}.log
    check $PG $log $objects 0 0 0 $type || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

function TEST_recovery_replicated_out1() {
    local dir=$1

    do_recovery_out1 $dir replicated || return 1
}

function TEST_recovery_erasure_out1() {
    local dir=$1

    do_recovery_out1 $dir erasure || return 1
}

# [0, 1] -> [2,3,4,5]
# degraded 2000 -> 0
# missing on primary 500 -> 0

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 500 2000 0 0 0 500 500 active+recovering+degraded 2017-10-27 09:38:37.453438 22'500 25:394 [2,4,3,5] 2 [2,4,3,5] 2 0'0 2017-10-27 09:37:58.046748 0'0 2017-10-27 09:37:58.046748
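#
# Expected arithmetic (a sketch): the pool grows from size 2 to size 4
# while both original OSDs are marked out, so each of the four OSDs in the
# new acting set starts with zero copies: degraded begins near
# 4 * $objects and drains to 0, and the new primary is initially missing
# every object, so MISSING_ON_PRIMARY starts near $objects.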
function TEST_recovery_sizeup() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # The pool has size 2, so only one OSD in the PG is not the primary
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    ceph osd out osd.$primary osd.$otherosd
    ceph osd pool set $poolname size 4
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get the new primary
    primary=$(get_primary $poolname obj1)

    local degraded=$(expr $objects \* 4)
    local log=$dir/osd.${primary}.log
    check $PG $log $degraded 0 0 0 || return 1

    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $objects || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

# [0, 1, 2, 4] -> [3, 5]
# degraded 1000 -> 0
# missing on primary 500 -> 0
# active+recovering+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 500 500 1000 0 0 0 500 500 active+recovering+degraded 2017-10-27 09:34:50.012261 22'500 27:118 [3,5] 3 [3,5] 3 0'0 2017-10-27 09:34:08.617248 0'0 2017-10-27 09:34:08.617248
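#
# Expected arithmetic (a sketch): the pool shrinks from size 4 to size 2
# while all four original OSDs are marked out, so both OSDs in the new
# acting set start empty: degraded begins near 2 * $objects and drains to
# 0, and the new primary again starts missing every object.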
function TEST_recovery_sizedown() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1
    run_osd $dir 4 || return 1
    run_osd $dir 5 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 4

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    # Get all OSDs in the up/acting set; they will all be marked out
    local allosds=$(get_osds $poolname obj1)

    ceph osd set norecover
    for osd in $allosds
    do
        ceph osd out osd.$osd
    done

    ceph osd pool set $poolname size 2
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    # Get the new primary
    primary=$(get_primary $poolname obj1)

    local degraded=$(expr $objects \* 2)
    local log=$dir/osd.${primary}.log
    check $PG $log $degraded 0 0 0 || return 1

    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    # This is the value set into MISSING_ON_PRIMARY
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $objects || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats missing shard $primary " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST 0 || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

# [1] -> [1,2]
# degraded 200 -> 100
# active+recovering+undersized+degraded

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 100 0 200 0 0 0 100 100 active+recovering+undersized+degraded 2017-11-17 17:16:15.302943 13'500 16:643 [1,2] 1 [1,2] 1 0'0 2017-11-17 17:15:34.985563 0'0 2017-11-17 17:15:34.985563
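#
# Expected arithmetic (a sketch): the pool grows from size 1 to size 3,
# but with one of the three OSDs marked out only two can be mapped, so the
# PG stays undersized: degraded starts near 2 * $objects (two replicas to
# create per object) and only drains to $objects (one replica can never be
# placed). The sample row above appears to come from a run with 100
# objects; run() now sets objects=200.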
function TEST_recovery_undersized() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 1

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)

    ceph osd set norecover
    # Mark one OSD that is not the primary as out (with size 1 it holds no replica)
    for i in 0 1 2
    do
        if [ $i = $primary ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    ceph osd pool set $poolname size 3
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    # Give extra sleep time because the loop below doesn't have the
    # sophistication of wait_for_clean()
    sleep 10
    flush_pg_stats

    # Wait for recovery to finish
    # Can't use wait_for_clean() because the state goes from
    # active+recovering+undersized+degraded to active+undersized+degraded
    for i in $(seq 1 60)
    do
        if ceph pg dump pgs | grep ^$PG | grep -qv recovering
        then
            break
        fi
        if [ $i = "60" ];
        then
            echo "Timeout waiting for recovery to finish"
            return 1
        fi
        sleep 1
    done

    # Get the new primary
    primary=$(get_primary $poolname obj1)
    local log=$dir/osd.${primary}.log

    UPACT=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats " $log | tail -1 | sed "s/.*[)] \([[][^ p]*\).*$/\1/")

    local degraded=$(expr $objects \* 2)
    FIRST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " $log | grep -F " $UPACT " | head -1 | sed "s/.* \([0-9]*\)$/\1/")
    below_margin $FIRST $degraded || return 1
    LAST=$(grep "pg[[]${PG}.*recovering.*_update_calc_stats degraded " $log | tail -1 | sed "s/.* \([0-9]*\)$/\1/")
    above_margin $LAST $objects || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

# [1,0,2] -> [1,3,NONE]/[1,3,2]
# degraded 100 -> 0
# misplaced 100 -> 100
# active+recovering+degraded+remapped

# PG_STAT OBJECTS MISSING_ON_PRIMARY DEGRADED MISPLACED UNFOUND BYTES LOG DISK_LOG STATE STATE_STAMP VERSION REPORTED UP UP_PRIMARY ACTING ACTING_PRIMARY LAST_SCRUB SCRUB_STAMP LAST_DEEP_SCRUB DEEP_SCRUB_STAMP
# 1.0 100 0 100 100 0 0 100 100 active+recovering+degraded+remapped 2017-11-27 21:24:20.851243 18'500 23:618 [1,3,NONE] 1 [1,3,2] 1 0'0 2017-11-27 21:23:39.395242 0'0 2017-11-27 21:23:39.395242
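#
# Expected arithmetic (a sketch): one shard's OSD is stopped and marked
# out, and a second OSD is marked out while still alive, leaving up
# [1,3,NONE] against acting [1,3,2]. Degraded starts near $objects (the
# shard lost with the dead OSD) and drains to 0 as the replacement is
# backfilled, while misplaced stays near $objects because the shard on the
# out-but-alive OSD has nowhere to move (the NONE slot in the up set).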
function TEST_recovery_erasure_remapped() {
    local dir=$1

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    run_osd $dir 2 || return 1
    run_osd $dir 3 || return 1

    ceph osd erasure-code-profile set myprofile plugin=jerasure technique=reed_sol_van k=2 m=1 crush-failure-domain=osd
    create_pool $poolname 1 1 erasure myprofile
    ceph osd pool set $poolname min_size 2

    wait_for_clean || return 1

    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i /dev/null
    done

    local primary=$(get_primary $poolname obj1)
    local PG=$(get_pg $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    ceph osd set norecover
    kill $(cat $dir/osd.${otherosd}.pid)
    ceph osd down osd.${otherosd}
    ceph osd out osd.${otherosd}

    # Mark one OSD that is neither the primary nor the down/out OSD as out
    for i in 0 1 2 3
    do
        if [ $i = $primary ];
        then
            continue
        fi
        if [ $i = $otherosd ];
        then
            continue
        fi
        ceph osd out osd.$i
        break
    done
    ceph osd unset norecover
    ceph tell osd.$(get_primary $poolname obj1) debug kick_recovery_wq 0
    sleep 2

    wait_for_clean || return 1

    local log=$dir/osd.${primary}.log
    check $PG $log $objects 0 $objects $objects erasure || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}

main osd-recovery-stats "$@"

# Local Variables:
# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-stats.sh"
# End: