1 #!/usr/bin/env bash
2 #
3 # Copyright (C) 2015 Red Hat <contact@redhat.com>
4 #
5 # Author: David Zafman <dzafman@redhat.com>
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
16 #
17 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
18
19 # Test development and debugging
20 # Set getjson to "yes" to ignore diff errors and save the observed results for updating the test
21 getjson="no"
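# Typical update workflow (sketch): run with getjson="yes" so multidiff failures are
# tolerated and the observed inconsistency JSON is saved to save1.json, then fold that
# output back into the expected heredocs below.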
22
23 jqfilter='.inconsistents'
24 sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
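# Note: 'sortkeys' uses a Python 2 print statement, so the bare 'python' interpreter
# invoked below is expected to be Python 2.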
25
26 function run() {
27 local dir=$1
28 shift
29
30 export CEPH_MON="127.0.0.1:7121" # git grep '\<7121\>' : there must be only one
31 export CEPH_ARGS
32 CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
33 CEPH_ARGS+="--mon-host=$CEPH_MON "
34
35 export -n CEPH_CLI_TEST_DUP_COMMAND
36 local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
37 for func in $funcs ; do
38 setup $dir || return 1
39 $func $dir || return 1
40 teardown $dir || return 1
41 done
42 }
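# run() executes every TEST_* function (or only those named on the command line),
# setting up a fresh cluster directory before each one and tearing it down after.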
43
44 function create_scenario() {
45 local dir=$1
46 local poolname=$2
47 local TESTDATA=$3
48 local osd=$4
49
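# Build the test scenario: take snapshots snap1..snap7 while writing to obj1..obj16,
# then stop the OSD and use ceph-objectstore-tool / ceph-kvstore-tool to inject a
# variety of snapshot inconsistencies for scrub to detect.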
50 SNAP=1
51 rados -p $poolname mksnap snap${SNAP}
52 dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
53 rados -p $poolname put obj1 $TESTDATA
54 rados -p $poolname put obj5 $TESTDATA
55 rados -p $poolname put obj3 $TESTDATA
56 for i in `seq 6 14`
57 do rados -p $poolname put obj${i} $TESTDATA
58 done
59
60 SNAP=2
61 rados -p $poolname mksnap snap${SNAP}
62 dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
63 rados -p $poolname put obj5 $TESTDATA
64
65 SNAP=3
66 rados -p $poolname mksnap snap${SNAP}
67 dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
68 rados -p $poolname put obj3 $TESTDATA
69
70 SNAP=4
71 rados -p $poolname mksnap snap${SNAP}
72 dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
73 rados -p $poolname put obj5 $TESTDATA
74 rados -p $poolname put obj2 $TESTDATA
75
76 SNAP=5
77 rados -p $poolname mksnap snap${SNAP}
78 SNAP=6
79 rados -p $poolname mksnap snap${SNAP}
80 dd if=/dev/urandom of=$TESTDATA bs=256 count=${SNAP}
81 rados -p $poolname put obj5 $TESTDATA
82
83 SNAP=7
84 rados -p $poolname mksnap snap${SNAP}
85
86 rados -p $poolname rm obj4
87 rados -p $poolname rm obj16
88 rados -p $poolname rm obj2
89
90 kill_daemons $dir TERM osd || return 1
91
92 # No need for the ceph_objectstore_tool() helper here because the osd is already stopped
93
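# Summary of the injected corruptions:
#   obj1  - head removed, leaving a headless clone
#   obj2  - 'snapset' attr removed from the head
#   obj3  - head data resized so it no longer matches the object info size
#   obj4  - clone 7 removed (clone_missing)
#   obj5  - clones 1 and 2 removed (clone 1 keeps its snapmap entry), clone 4 resized,
#           and an extra clone 7 created without object info
#   obj6..obj14 - clear-snapset applied with different corruption modes (none, corrupt,
#                 seq, clone_size, clone_overlap, clones, head, snaps, size)
#   obj15 - 'snapset' attr replaced with garbage
#   obj16 - snapmap entry removed for clone 7 while the clone itself remains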
94 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj1)"
95 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" --force remove || return 1
96
97 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":2)"
98 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove || return 1
99
100 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":1)"
101 OBJ5SAVE="$JSON"
102 # The clone starts out with a snapmap entry
103 ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
104 grep "^M.*MAP_.*[.]1[.]obj5[.][.]$" $dir/drk.log || return 1
105 ceph-objectstore-tool --data-path $dir/${osd} --rmtype nosnapmap "$JSON" remove || return 1
106 # Check that the snapmap entry is still there
107 ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
108 grep "^M.*MAP_.*[.]1[.]obj5[.][.]$" $dir/drk.log || return 1
109 rm -f $dir/drk.log
110
111 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj5 | grep \"snapid\":4)"
112 dd if=/dev/urandom of=$TESTDATA bs=256 count=18
113 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
114
115 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj3)"
116 dd if=/dev/urandom of=$TESTDATA bs=256 count=15
117 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
118
119 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj4 | grep \"snapid\":7)"
120 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" remove || return 1
121
122 # The clone starts out with a snapmap entry
123 ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
124 grep "^M.*MAP_.*[.]7[.]obj16[.][.]$" $dir/drk.log || return 1
125 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --op list obj16 | grep \"snapid\":7)"
126 ceph-objectstore-tool --data-path $dir/${osd} --rmtype snapmap "$JSON" remove || return 1
127 # Check that the snapmap entry is now removed
128 ceph-kvstore-tool bluestore-kv $dir/${osd} list 2> /dev/null > $dir/drk.log
129 ! grep "^M.*MAP_.*[.]7[.]obj16[.][.]$" $dir/drk.log || return 1
130 rm -f $dir/drk.log
131
132 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj2)"
133 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" rm-attr snapset || return 1
134
135 # Create a clone which isn't in the snapset and doesn't have object info
136 JSON="$(echo "$OBJ5SAVE" | sed s/snapid\":1/snapid\":7/)"
137 dd if=/dev/urandom of=$TESTDATA bs=256 count=7
138 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-bytes $TESTDATA || return 1
139
140 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj6)"
141 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset || return 1
142 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj7)"
143 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset corrupt || return 1
144 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj8)"
145 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset seq || return 1
146 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj9)"
147 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_size || return 1
148 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj10)"
149 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clone_overlap || return 1
150 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj11)"
151 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset clones || return 1
152 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj12)"
153 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset head || return 1
154 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj13)"
155 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset snaps || return 1
156 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj14)"
157 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" clear-snapset size || return 1
158
159 echo "garbage" > $dir/bad
160 JSON="$(ceph-objectstore-tool --data-path $dir/${osd} --head --op list obj15)"
161 ceph-objectstore-tool --data-path $dir/${osd} "$JSON" set-attr snapset $dir/bad || return 1
162 rm -f $dir/bad
163 return 0
164 }
165
166 function TEST_scrub_snaps() {
167 local dir=$1
168 local poolname=test
169 local OBJS=16
170 local OSDS=1
171
172 TESTDATA="testdata.$$"
173
174 run_mon $dir a --osd_pool_default_size=$OSDS || return 1
175 run_mgr $dir x || return 1
176 for osd in $(seq 0 $(expr $OSDS - 1))
177 do
178 run_osd $dir $osd || return 1
179 done
180
181 # All scrubs done manually. Don't want any unexpected scheduled scrubs.
182 ceph osd set noscrub || return 1
183 ceph osd set nodeep-scrub || return 1
184
185 # Create a pool with a single pg
186 create_pool $poolname 1 1
187 wait_for_clean || return 1
188 poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
189
190 dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
191 for i in `seq 1 $OBJS`
192 do
193 rados -p $poolname put obj${i} $TESTDATA
194 done
195
196 local primary=$(get_primary $poolname obj1)
197
198 create_scenario $dir $poolname $TESTDATA $primary || return 1
199
200 rm -f $TESTDATA
201
202 for osd in $(seq 0 $(expr $OSDS - 1))
203 do
204 run_osd $dir $osd || return 1
205 done
206
207 local pgid="${poolid}.0"
208 if ! pg_scrub "$pgid" ; then
209 return 1
210 fi
211
212 test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" = "2" || return 1
213
214 rados list-inconsistent-pg $poolname > $dir/json || return 1
215 # Check pg count
216 test $(jq '. | length' $dir/json) = "1" || return 1
217 # Check pgid
218 test $(jq -r '.[0]' $dir/json) = $pgid || return 1
219
220 rados list-inconsistent-obj $pgid > $dir/json || return 1
221
222 # With a single-copy pool the injected snapshot errors don't show up as
223 # object errors here, because those are only detected by comparing
224 # copies across OSDs.
225 jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
226 {
227 "epoch": 17,
228 "inconsistents": []
229 }
230 EOF
231
232 jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
233 multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
234
235 rados list-inconsistent-snapset $pgid > $dir/json || return 1
236
237 jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
238 {
239 "inconsistents": [
240 {
241 "errors": [
242 "headless"
243 ],
244 "snap": 1,
245 "locator": "",
246 "nspace": "",
247 "name": "obj1"
248 },
249 {
250 "errors": [
251 "size_mismatch"
252 ],
253 "snap": 1,
254 "locator": "",
255 "nspace": "",
256 "name": "obj10"
257 },
258 {
259 "errors": [
260 "headless"
261 ],
262 "snap": 1,
263 "locator": "",
264 "nspace": "",
265 "name": "obj11"
266 },
267 {
268 "errors": [
269 "size_mismatch"
270 ],
271 "snap": 1,
272 "locator": "",
273 "nspace": "",
274 "name": "obj14"
275 },
276 {
277 "errors": [
278 "headless"
279 ],
280 "snap": 1,
281 "locator": "",
282 "nspace": "",
283 "name": "obj6"
284 },
285 {
286 "errors": [
287 "headless"
288 ],
289 "snap": 1,
290 "locator": "",
291 "nspace": "",
292 "name": "obj7"
293 },
294 {
295 "errors": [
296 "size_mismatch"
297 ],
298 "snap": 1,
299 "locator": "",
300 "nspace": "",
301 "name": "obj9"
302 },
303 {
304 "errors": [
305 "headless"
306 ],
307 "snap": 4,
308 "locator": "",
309 "nspace": "",
310 "name": "obj2"
311 },
312 {
313 "errors": [
314 "size_mismatch"
315 ],
316 "snap": 4,
317 "locator": "",
318 "nspace": "",
319 "name": "obj5"
320 },
321 {
322 "errors": [
323 "headless"
324 ],
325 "snap": 7,
326 "locator": "",
327 "nspace": "",
328 "name": "obj2"
329 },
330 {
331 "errors": [
332 "info_missing",
333 "headless"
334 ],
335 "snap": 7,
336 "locator": "",
337 "nspace": "",
338 "name": "obj5"
339 },
340 {
341 "name": "obj10",
342 "nspace": "",
343 "locator": "",
344 "snap": "head",
345 "snapset": {
346 "snap_context": {
347 "seq": 1,
348 "snaps": [
349 1
350 ]
351 },
352 "clones": [
353 {
354 "snap": 1,
355 "size": 1032,
356 "overlap": "????",
357 "snaps": [
358 1
359 ]
360 }
361 ]
362 },
363 "errors": []
364 },
365 {
366 "extra clones": [
367 1
368 ],
369 "errors": [
370 "extra_clones"
371 ],
372 "snap": "head",
373 "locator": "",
374 "nspace": "",
375 "name": "obj11",
376 "snapset": {
377 "snap_context": {
378 "seq": 1,
379 "snaps": [
380 1
381 ]
382 },
383 "clones": []
384 }
385 },
386 {
387 "name": "obj14",
388 "nspace": "",
389 "locator": "",
390 "snap": "head",
391 "snapset": {
392 "snap_context": {
393 "seq": 1,
394 "snaps": [
395 1
396 ]
397 },
398 "clones": [
399 {
400 "snap": 1,
401 "size": 1033,
402 "overlap": "[]",
403 "snaps": [
404 1
405 ]
406 }
407 ]
408 },
409 "errors": []
410 },
411 {
412 "errors": [
413 "snapset_corrupted"
414 ],
415 "snap": "head",
416 "locator": "",
417 "nspace": "",
418 "name": "obj15"
419 },
420 {
421 "extra clones": [
422 7,
423 4
424 ],
425 "errors": [
426 "snapset_missing",
427 "extra_clones"
428 ],
429 "snap": "head",
430 "locator": "",
431 "nspace": "",
432 "name": "obj2"
433 },
434 {
435 "errors": [
436 "size_mismatch"
437 ],
438 "snap": "head",
439 "locator": "",
440 "nspace": "",
441 "name": "obj3",
442 "snapset": {
443 "snap_context": {
444 "seq": 3,
445 "snaps": [
446 3,
447 2,
448 1
449 ]
450 },
451 "clones": [
452 {
453 "snap": 1,
454 "size": 1032,
455 "overlap": "[]",
456 "snaps": [
457 1
458 ]
459 },
460 {
461 "snap": 3,
462 "size": 256,
463 "overlap": "[]",
464 "snaps": [
465 3,
466 2
467 ]
468 }
469 ]
470 }
471 },
472 {
473 "missing": [
474 7
475 ],
476 "errors": [
477 "clone_missing"
478 ],
479 "snap": "head",
480 "locator": "",
481 "nspace": "",
482 "name": "obj4",
483 "snapset": {
484 "snap_context": {
485 "seq": 7,
486 "snaps": [
487 7,
488 6,
489 5,
490 4,
491 3,
492 2,
493 1
494 ]
495 },
496 "clones": [
497 {
498 "snap": 7,
499 "size": 1032,
500 "overlap": "[]",
501 "snaps": [
502 7,
503 6,
504 5,
505 4,
506 3,
507 2,
508 1
509 ]
510 }
511 ]
512 }
513 },
514 {
515 "missing": [
516 2,
517 1
518 ],
519 "extra clones": [
520 7
521 ],
522 "errors": [
523 "extra_clones",
524 "clone_missing"
525 ],
526 "snap": "head",
527 "locator": "",
528 "nspace": "",
529 "name": "obj5",
530 "snapset": {
531 "snap_context": {
532 "seq": 6,
533 "snaps": [
534 6,
535 5,
536 4,
537 3,
538 2,
539 1
540 ]
541 },
542 "clones": [
543 {
544 "snap": 1,
545 "size": 1032,
546 "overlap": "[]",
547 "snaps": [
548 1
549 ]
550 },
551 {
552 "snap": 2,
553 "size": 256,
554 "overlap": "[]",
555 "snaps": [
556 2
557 ]
558 },
559 {
560 "snap": 4,
561 "size": 512,
562 "overlap": "[]",
563 "snaps": [
564 4,
565 3
566 ]
567 },
568 {
569 "snap": 6,
570 "size": 1024,
571 "overlap": "[]",
572 "snaps": [
573 6,
574 5
575 ]
576 }
577 ]
578 }
579 },
580 {
581 "extra clones": [
582 1
583 ],
584 "errors": [
585 "extra_clones"
586 ],
587 "snap": "head",
588 "locator": "",
589 "nspace": "",
590 "name": "obj6",
591 "snapset": {
592 "snap_context": {
593 "seq": 1,
594 "snaps": [
595 1
596 ]
597 },
598 "clones": []
599 }
600 },
601 {
602 "extra clones": [
603 1
604 ],
605 "errors": [
606 "extra_clones"
607 ],
608 "snap": "head",
609 "locator": "",
610 "nspace": "",
611 "name": "obj7",
612 "snapset": {
613 "snap_context": {
614 "seq": 0,
615 "snaps": []
616 },
617 "clones": []
618 }
619 },
620 {
621 "errors": [
622 "snapset_error"
623 ],
624 "snap": "head",
625 "locator": "",
626 "nspace": "",
627 "name": "obj8",
628 "snapset": {
629 "snap_context": {
630 "seq": 0,
631 "snaps": [
632 1
633 ]
634 },
635 "clones": [
636 {
637 "snap": 1,
638 "size": 1032,
639 "overlap": "[]",
640 "snaps": [
641 1
642 ]
643 }
644 ]
645 }
646 },
647 {
648 "name": "obj9",
649 "nspace": "",
650 "locator": "",
651 "snap": "head",
652 "snapset": {
653 "snap_context": {
654 "seq": 1,
655 "snaps": [
656 1
657 ]
658 },
659 "clones": [
660 {
661 "snap": 1,
662 "size": "????",
663 "overlap": "[]",
664 "snaps": [
665 1
666 ]
667 }
668 ]
669 },
670 "errors": []
671 }
672 ],
673 "epoch": 20
674 }
675 EOF
676
677 jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
678 multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
679 if test $getjson = "yes"
680 then
681 jq '.' $dir/json > save1.json
682 fi
683
684 if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
685 then
686 jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
687 fi
688
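# Record the OSD pids so that after snap trimming we can verify with 'kill -0' that
# no OSD crashed along the way.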
689 pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
690 pids=""
691 for pidfile in ${pidfiles}
692 do
693 pids+="$(cat $pidfile) "
694 done
695
696 ERRORS=0
697
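# Remove every snapshot and wait for snap trimming to finish (up to roughly 25 seconds),
# bailing out early if the pg reaches the snaptrim_error state.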
698 for i in `seq 1 7`
699 do
700 rados -p $poolname rmsnap snap$i
701 done
702 sleep 5
703 local -i loop=0
704 while ceph pg dump pgs | grep -q snaptrim;
705 do
706 if ceph pg dump pgs | grep -q snaptrim_error;
707 then
708 break
709 fi
710 sleep 2
711 loop+=1
712 if (( $loop >= 10 )) ; then
713 ERRORS=$(expr $ERRORS + 1)
714 break
715 fi
716 done
717 ceph pg dump pgs
718
719 for pid in $pids
720 do
721 if ! kill -0 $pid
722 then
723 echo "OSD Crash occurred"
724 ERRORS=$(expr $ERRORS + 1)
725 fi
726 done
727
728 kill_daemons $dir || return 1
729
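# Each expected cluster-log message below corresponds to one of the corruptions
# injected by create_scenario() above.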
730 declare -a err_strings
731 err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj10:.* : is missing in clone_overlap"
732 err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 : no '_' attr"
733 err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:7 : is an unexpected clone"
734 err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*::obj5:4 : on disk size [(]4608[)] does not match object info size [(]512[)] adjusted for ondisk to [(]512[)]"
735 err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head : expected clone .*:::obj5:2"
736 err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj5:head : expected clone .*:::obj5:1"
737 err_strings[6]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj5:head : 2 missing clone[(]s[)]"
738 err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj8:head : snaps.seq not set"
739 err_strings[8]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj7:1 : is an unexpected clone"
740 err_strings[9]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj3:head : on disk size [(]3840[)] does not match object info size [(]768[)] adjusted for ondisk to [(]768[)]"
741 err_strings[10]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj6:1 : is an unexpected clone"
742 err_strings[11]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:head : no 'snapset' attr"
743 err_strings[12]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:7 : clone ignored due to missing snapset"
744 err_strings[13]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj2:4 : clone ignored due to missing snapset"
745 err_strings[14]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj4:head : expected clone .*:::obj4:7"
746 err_strings[15]="log_channel[(]cluster[)] log [[]INF[]] : scrub [0-9]*[.]0 .*:::obj4:head : 1 missing clone[(]s[)]"
747 err_strings[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj1:1 : is an unexpected clone"
748 err_strings[17]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj9:1 : is missing in clone_size"
749 err_strings[18]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj11:1 : is an unexpected clone"
750 err_strings[19]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj14:1 : size 1032 != clone_size 1033"
751 err_strings[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 20 errors"
752 err_strings[21]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 .*:::obj15:head : can't decode 'snapset' attr buffer"
753 err_strings[22]="log_channel[(]cluster[)] log [[]ERR[]] : osd[.][0-9]* found snap mapper error on pg 1.0 oid 1:461f8b5e:::obj16:7 snaps missing in mapper, should be: 1,2,3,4,5,6,7 was r -2...repaired"
754
755 for err_string in "${err_strings[@]}"
756 do
757 if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
758 then
759 echo "Missing log message '$err_string'"
760 ERRORS=$(expr $ERRORS + 1)
761 fi
762 done
763
764 if [ $ERRORS != "0" ];
765 then
766 echo "TEST FAILED WITH $ERRORS ERRORS"
767 return 1
768 fi
769
770 echo "TEST PASSED"
771 return 0
772 }
773
774 function _scrub_snaps_multi() {
775 local dir=$1
776 local poolname=test
777 local OBJS=16
778 local OSDS=2
779 local which=$2
780
781 TESTDATA="testdata.$$"
782
783 run_mon $dir a --osd_pool_default_size=$OSDS || return 1
784 run_mgr $dir x || return 1
785 for osd in $(seq 0 $(expr $OSDS - 1))
786 do
787 run_osd $dir $osd || return 1
788 done
789
790 # All scrubs done manually. Don't want any unexpected scheduled scrubs.
791 ceph osd set noscrub || return 1
792 ceph osd set nodeep-scrub || return 1
793
794 # Create a pool with a single pg
795 create_pool $poolname 1 1
796 wait_for_clean || return 1
797 poolid=$(ceph osd dump | grep "^pool.*[']test[']" | awk '{ print $2 }')
798
799 dd if=/dev/urandom of=$TESTDATA bs=1032 count=1
800 for i in `seq 1 $OBJS`
801 do
802 rados -p $poolname put obj${i} $TESTDATA
803 done
804
805 local primary=$(get_primary $poolname obj1)
806 local replica=$(get_not_primary $poolname obj1)
807
808 eval create_scenario $dir $poolname $TESTDATA \$$which || return 1
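# The indirect expansion resolves $which ("primary" or "replica") to the matching osd
# id, so the corruption is injected on only that OSD.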
809
810 rm -f $TESTDATA
811
812 for osd in $(seq 0 $(expr $OSDS - 1))
813 do
814 run_osd $dir $osd || return 1
815 done
816
817 local pgid="${poolid}.0"
818 if ! pg_scrub "$pgid" ; then
819 return 1
820 fi
821
822 test "$(grep "_scan_snaps start" $dir/osd.${primary}.log | wc -l)" -gt "3" || return 1
823 test "$(grep "_scan_snaps start" $dir/osd.${replica}.log | wc -l)" -gt "3" || return 1
824
825 rados list-inconsistent-pg $poolname > $dir/json || return 1
826 # Check pg count
827 test $(jq '. | length' $dir/json) = "1" || return 1
828 # Check pgid
829 test $(jq -r '.[0]' $dir/json) = $pgid || return 1
830
831 rados list-inconsistent-obj $pgid --format=json-pretty
832
833 rados list-inconsistent-snapset $pgid > $dir/json || return 1
834
835 # Since all of the snapshots on the primary are consistent, there are no errors here
836 if [ $which = "replica" ];
837 then
838 scruberrors="20"
839 jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
840 {
841 "epoch": 23,
842 "inconsistents": []
843 }
844 EOF
845
846 else
847 scruberrors="30"
848 jq "$jqfilter" << EOF | python -c "$sortkeys" > $dir/checkcsjson
849 {
850 "epoch": 23,
851 "inconsistents": [
852 {
853 "name": "obj10",
854 "nspace": "",
855 "locator": "",
856 "snap": 1,
857 "errors": [
858 "size_mismatch"
859 ]
860 },
861 {
862 "name": "obj11",
863 "nspace": "",
864 "locator": "",
865 "snap": 1,
866 "errors": [
867 "headless"
868 ]
869 },
870 {
871 "name": "obj14",
872 "nspace": "",
873 "locator": "",
874 "snap": 1,
875 "errors": [
876 "size_mismatch"
877 ]
878 },
879 {
880 "name": "obj6",
881 "nspace": "",
882 "locator": "",
883 "snap": 1,
884 "errors": [
885 "headless"
886 ]
887 },
888 {
889 "name": "obj7",
890 "nspace": "",
891 "locator": "",
892 "snap": 1,
893 "errors": [
894 "headless"
895 ]
896 },
897 {
898 "name": "obj9",
899 "nspace": "",
900 "locator": "",
901 "snap": 1,
902 "errors": [
903 "size_mismatch"
904 ]
905 },
906 {
907 "name": "obj5",
908 "nspace": "",
909 "locator": "",
910 "snap": 7,
911 "errors": [
912 "info_missing",
913 "headless"
914 ]
915 },
916 {
917 "name": "obj10",
918 "nspace": "",
919 "locator": "",
920 "snap": "head",
921 "snapset": {
922 "snap_context": {
923 "seq": 1,
924 "snaps": [
925 1
926 ]
927 },
928 "clones": [
929 {
930 "snap": 1,
931 "size": 1032,
932 "overlap": "????",
933 "snaps": [
934 1
935 ]
936 }
937 ]
938 },
939 "errors": []
940 },
941 {
942 "name": "obj11",
943 "nspace": "",
944 "locator": "",
945 "snap": "head",
946 "snapset": {
947 "snap_context": {
948 "seq": 1,
949 "snaps": [
950 1
951 ]
952 },
953 "clones": []
954 },
955 "errors": [
956 "extra_clones"
957 ],
958 "extra clones": [
959 1
960 ]
961 },
962 {
963 "name": "obj14",
964 "nspace": "",
965 "locator": "",
966 "snap": "head",
967 "snapset": {
968 "snap_context": {
969 "seq": 1,
970 "snaps": [
971 1
972 ]
973 },
974 "clones": [
975 {
976 "snap": 1,
977 "size": 1033,
978 "overlap": "[]",
979 "snaps": [
980 1
981 ]
982 }
983 ]
984 },
985 "errors": []
986 },
987 {
988 "name": "obj5",
989 "nspace": "",
990 "locator": "",
991 "snap": "head",
992 "snapset": {
993 "snap_context": {
994 "seq": 6,
995 "snaps": [
996 6,
997 5,
998 4,
999 3,
1000 2,
1001 1
1002 ]
1003 },
1004 "clones": [
1005 {
1006 "snap": 1,
1007 "size": 1032,
1008 "overlap": "[]",
1009 "snaps": [
1010 1
1011 ]
1012 },
1013 {
1014 "snap": 2,
1015 "size": 256,
1016 "overlap": "[]",
1017 "snaps": [
1018 2
1019 ]
1020 },
1021 {
1022 "snap": 4,
1023 "size": 512,
1024 "overlap": "[]",
1025 "snaps": [
1026 4,
1027 3
1028 ]
1029 },
1030 {
1031 "snap": 6,
1032 "size": 1024,
1033 "overlap": "[]",
1034 "snaps": [
1035 6,
1036 5
1037 ]
1038 }
1039 ]
1040 },
1041 "errors": [
1042 "extra_clones"
1043 ],
1044 "extra clones": [
1045 7
1046 ]
1047 },
1048 {
1049 "name": "obj6",
1050 "nspace": "",
1051 "locator": "",
1052 "snap": "head",
1053 "snapset": {
1054 "snap_context": {
1055 "seq": 1,
1056 "snaps": [
1057 1
1058 ]
1059 },
1060 "clones": []
1061 },
1062 "errors": [
1063 "extra_clones"
1064 ],
1065 "extra clones": [
1066 1
1067 ]
1068 },
1069 {
1070 "name": "obj7",
1071 "nspace": "",
1072 "locator": "",
1073 "snap": "head",
1074 "snapset": {
1075 "snap_context": {
1076 "seq": 0,
1077 "snaps": []
1078 },
1079 "clones": []
1080 },
1081 "errors": [
1082 "extra_clones"
1083 ],
1084 "extra clones": [
1085 1
1086 ]
1087 },
1088 {
1089 "name": "obj8",
1090 "nspace": "",
1091 "locator": "",
1092 "snap": "head",
1093 "snapset": {
1094 "snap_context": {
1095 "seq": 0,
1096 "snaps": [
1097 1
1098 ]
1099 },
1100 "clones": [
1101 {
1102 "snap": 1,
1103 "size": 1032,
1104 "overlap": "[]",
1105 "snaps": [
1106 1
1107 ]
1108 }
1109 ]
1110 },
1111 "errors": [
1112 "snapset_error"
1113 ]
1114 },
1115 {
1116 "name": "obj9",
1117 "nspace": "",
1118 "locator": "",
1119 "snap": "head",
1120 "snapset": {
1121 "snap_context": {
1122 "seq": 1,
1123 "snaps": [
1124 1
1125 ]
1126 },
1127 "clones": [
1128 {
1129 "snap": 1,
1130 "size": "????",
1131 "overlap": "[]",
1132 "snaps": [
1133 1
1134 ]
1135 }
1136 ]
1137 },
1138 "errors": []
1139 }
1140 ]
1141 }
1142 EOF
1143 fi
1144
1145 jq "$jqfilter" $dir/json | python -c "$sortkeys" > $dir/csjson
1146 multidiff $dir/checkcsjson $dir/csjson || test $getjson = "yes" || return 1
1147 if test $getjson = "yes"
1148 then
1149 jq '.' $dir/json > save1.json
1150 fi
1151
1152 if test "$LOCALRUN" = "yes" && which jsonschema > /dev/null;
1153 then
1154 jsonschema -i $dir/json $CEPH_ROOT/doc/rados/command/list-inconsistent-snap.json || return 1
1155 fi
1156
1157 pidfiles=$(find $dir 2>/dev/null | grep 'osd[^/]*\.pid')
1158 pids=""
1159 for pidfile in ${pidfiles}
1160 do
1161 pids+="$(cat $pidfile) "
1162 done
1163
1164 ERRORS=0
1165
1166 # When removing snapshots with a corrupt replica, the OSD crashes.
1167 # See http://tracker.ceph.com/issues/23875
1168 if [ $which = "primary" ];
1169 then
1170 for i in `seq 1 7`
1171 do
1172 rados -p $poolname rmsnap snap$i
1173 done
1174 sleep 5
1175 local -i loop=0
1176 while ceph pg dump pgs | grep -q snaptrim;
1177 do
1178 if ceph pg dump pgs | grep -q snaptrim_error;
1179 then
1180 break
1181 fi
1182 sleep 2
1183 loop+=1
1184 if (( $loop >= 10 )) ; then
1185 ERRORS=$(expr $ERRORS + 1)
1186 break
1187 fi
1188 done
1189 fi
1190 ceph pg dump pgs
1191
1192 for pid in $pids
1193 do
1194 if ! kill -0 $pid
1195 then
1196 echo "OSD Crash occurred"
1197 ERRORS=$(expr $ERRORS + 1)
1198 fi
1199 done
1200
1201 kill_daemons $dir || return 1
1202
1203 declare -a err_strings
1204 err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj4:7 : missing"
1205 err_strings[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] soid .*:::obj3:head : size 3840 != size 768 from auth oi"
1206 err_strings[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj5:1 : missing"
1207 err_strings[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj5:2 : missing"
1208 err_strings[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] soid .*:::obj5:4 : size 4608 != size 512 from auth oi"
1209 err_strings[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid .*:::obj5:7 : failed to pick suitable object info"
1210 err_strings[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard [0-1] .*:::obj1:head : missing"
1211 err_strings[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub ${scruberrors} errors"
1212
1213 for err_string in "${err_strings[@]}"
1214 do
1215 if ! grep "$err_string" $dir/osd.${primary}.log > /dev/null;
1216 then
1217 echo "Missing log message '$err_string'"
1218 ERRORS=$(expr $ERRORS + 1)
1219 fi
1220 done
1221
1222 # Check messages specific to the OSD where the scenario was injected
1223 declare -a rep_err_strings
1224 osd=$(eval echo \$$which)
1225 rep_err_strings[0]="log_channel[(]cluster[)] log [[]ERR[]] : osd[.][0-9]* found snap mapper error on pg 1.0 oid 1:461f8b5e:::obj16:7 snaps missing in mapper, should be: 1,2,3,4,5,6,7 was r -2...repaired"
1226 for err_string in "${rep_err_strings[@]}"
1227 do
1228 if ! grep "$err_string" $dir/osd.${osd}.log > /dev/null;
1229 then
1230 echo "Missing log message '$err_string'"
1231 ERRORS=$(expr $ERRORS + 1)
1232 fi
1233 done
1234
1235 if [ $ERRORS != "0" ];
1236 then
1237 echo "TEST FAILED WITH $ERRORS ERRORS"
1238 return 1
1239 fi
1240
1241 echo "TEST PASSED"
1242 return 0
1243 }
1244
1245 function TEST_scrub_snaps_replica() {
1246 local dir=$1
1247 ORIG_ARGS=$CEPH_ARGS
1248 CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
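# Small scrub chunks (min=max=3) make the scrub run in many chunks; _scrub_snaps_multi
# relies on this when it checks for more than 3 "_scan_snaps start" lines per OSD log.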
1249 _scrub_snaps_multi $dir replica
1250 err=$?
1251 CEPH_ARGS=$ORIG_ARGS
1252 return $err
1253 }
1254
1255 function TEST_scrub_snaps_primary() {
1256 local dir=$1
1257 ORIG_ARGS=$CEPH_ARGS
1258 CEPH_ARGS+=" --osd_scrub_chunk_min=3 --osd_scrub_chunk_max=3"
1259 _scrub_snaps_multi $dir primary
1260 err=$?
1261 CEPH_ARGS=$ORIG_ARGS
1262 return $err
1263 }
1264
1265 main osd-scrub-snaps "$@"
1266
1267 # Local Variables:
1268 # compile-command: "cd build ; make -j4 && \
1269 # ../qa/run-standalone.sh osd-scrub-snaps.sh"
1270 # End: