3 # Copyright (C) 2014 Red Hat <contact@redhat.com>
5 # Author: Loic Dachary <loic@dachary.org>
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
18 source $CEPH_ROOT/qa
/standalone
/ceph-helpers.sh
# NOTE(review): on FreeBSD, EC-overwrite tests are force-disabled (filestore
# only there; overwrites are only tested on Bluestore, per the comments below).
# This extract is missing interior lines (embedded numbering jumps 24->29);
# code left byte-identical.
20 if [ `uname` = FreeBSD
]; then
21 # erasure coding overwrites are only tested on Bluestore
22 # erasure coding on filestore is unsafe
23 # http://docs.ceph.com/docs/master/rados/operations/erasure-code/#erasure-coding-with-overwrites
24 use_ec_overwrite
=false
# NOTE(review): jq program used to normalize object_info JSON before diffing:
# defines a recursive walk(f) helper (polyfill for older jq) and strips the
# volatile fields mtime/local_mtime/last_reqid/version/prior_version.
# Extract is missing interior lines (embedded numbering has gaps, e.g. 35, 37,
# 40-41); string left byte-identical.
29 # Test development and debugging
30 # Set to "yes" in order to ignore diff errors and save results to update test
33 # Filter out mtime and local_mtime dates, version, prior_version and last_reqid (client) from any object_info.
34 jqfilter
='def walk(f):
36 | if type == "object" then
38 ( {}; . + { ($key): ($in[$key] | walk(f)) } ) | f
39 elif type == "array" then map( walk(f) ) | f
42 walk(if type == "object" then del(.mtime) else . end)
43 | walk(if type == "object" then del(.local_mtime) else . end)
44 | walk(if type == "object" then del(.last_reqid) else . end)
45 | walk(if type == "object" then del(.version) else . end)
46 | walk(if type == "object" then del(.prior_version) else . end)'
48 sortkeys
='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print json.dumps(ud, sort_keys=True, indent=2)'
# NOTE(review): body of the test harness entry point (the enclosing
# "function run() {" line, its dir/positional handling, and the closing
# done/} are missing from this extract).  Sets up the monitor address and
# CEPH_ARGS, then invokes every shell function named TEST_* (or only those
# passed as arguments) against $dir.  Code left byte-identical.
54 export CEPH_MON
="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
56 CEPH_ARGS
+="--fsid=$(uuidgen) --auth-supported=none "
57 CEPH_ARGS
+="--mon-host=$CEPH_MON "
58 CEPH_ARGS
+="--osd-skip-data-digest=false "
60 export -n CEPH_CLI_TEST_DUP_COMMAND
61 local funcs
=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
62 for func
in $funcs ; do
63 $func $dir ||
return 1
# NOTE(review): writes an object (name from $3, default SOMETHING) into the
# pool given by $2 under test dir $1, optionally setting/unsetting the
# noscrub/nodeep-scrub OSD flags around the write (controlled by $4, default
# "noscrub").  Missing interior lines: the dir/poolname locals, the payload
# definition, and the fi/} terminators — code left byte-identical.
67 function add_something
() {
70 local obj
=${3:-SOMETHING}
71 local scrub
=${4:-noscrub}
73 if [ "$scrub" = "noscrub" ];
75 ceph osd
set noscrub ||
return 1
76 ceph osd
set nodeep-scrub ||
return 1
78 ceph osd
unset noscrub ||
return 1
79 ceph osd
unset nodeep-scrub ||
return 1
83 echo $payload > $dir/ORIGINAL
84 rados
--pool $poolname put
$obj $dir/ORIGINAL ||
return 1
# NOTE(review): spins up a 2-OSD replicated cluster, writes SOMETHING, then
# corrupts and repairs it once on a non-primary and once on the primary
# (the latter reproduces tracker issue 8914).  Missing interior lines
# (locals, the closing }) — code left byte-identical.
88 # Corrupt one copy of a replicated pool
90 function TEST_corrupt_and_repair_replicated
() {
94 setup
$dir ||
return 1
95 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
96 run_mgr
$dir x ||
return 1
97 run_osd
$dir 0 ||
return 1
98 run_osd
$dir 1 ||
return 1
99 create_rbd_pool ||
return 1
100 wait_for_clean ||
return 1
102 add_something
$dir $poolname ||
return 1
103 corrupt_and_repair_one
$dir $poolname $
(get_not_primary
$poolname SOMETHING
) ||
return 1
104 # Reproduces http://tracker.ceph.com/issues/8914
105 corrupt_and_repair_one
$dir $poolname $
(get_primary
$poolname SOMETHING
) ||
return 1
107 teardown
$dir ||
return 1
# NOTE(review): starts OSDs with scrub-during-recovery disabled but
# repair-during-recovery enabled, and with recovery pretended active, then
# verifies a repair still goes through (via corrupt_and_repair_one on a
# non-primary).  Missing interior lines (locals, closing }) — code left
# byte-identical.
111 # Allow repair to be scheduled when some recovering is still undergoing on the same OSD
113 function TEST_allow_repair_during_recovery
() {
117 setup
$dir ||
return 1
118 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
119 run_mgr
$dir x ||
return 1
120 run_osd
$dir 0 --osd_scrub_during_recovery=false \
121 --osd_repair_during_recovery=true \
122 --osd_debug_pretend_recovery_active=true ||
return 1
123 run_osd
$dir 1 --osd_scrub_during_recovery=false \
124 --osd_repair_during_recovery=true \
125 --osd_debug_pretend_recovery_active=true ||
return 1
126 create_rbd_pool ||
return 1
127 wait_for_clean ||
return 1
129 add_something
$dir $poolname ||
return 1
130 corrupt_and_repair_one
$dir $poolname $
(get_not_primary
$poolname SOMETHING
) ||
return 1
132 teardown
$dir ||
return 1
# NOTE(review): same OSD configuration as the allow-repair test, but asserts
# that a plain (non-repair) scrub does NOT get scheduled while recovery is
# pretended active (via scrub_and_not_schedule).  Missing interior lines —
# code left byte-identical.
136 # Skip non-repair scrub correctly during recovery
138 function TEST_skip_non_repair_during_recovery
() {
142 setup
$dir ||
return 1
143 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
144 run_mgr
$dir x ||
return 1
145 run_osd
$dir 0 --osd_scrub_during_recovery=false \
146 --osd_repair_during_recovery=true \
147 --osd_debug_pretend_recovery_active=true ||
return 1
148 run_osd
$dir 1 --osd_scrub_during_recovery=false \
149 --osd_repair_during_recovery=true \
150 --osd_debug_pretend_recovery_active=true ||
return 1
151 create_rbd_pool ||
return 1
152 wait_for_clean ||
return 1
154 add_something
$dir $poolname ||
return 1
155 scrub_and_not_schedule
$dir $poolname $
(get_not_primary
$poolname SOMETHING
) ||
return 1
157 teardown
$dir ||
return 1
# NOTE(review): issues a non-repair scrub, polls the last-scrub stamp a few
# times to confirm the scrub was NOT scheduled, then checks the object is
# still accessible and intact (list-attrs, rados get, diff vs ORIGINAL).
# Missing interior lines (locals, the scrub command, sleep/loop bodies,
# terminators) — code left byte-identical.
160 function scrub_and_not_schedule
() {
166 # 1) start a non-repair scrub
168 local pg
=$
(get_pg
$poolname SOMETHING
)
169 local last_scrub
=$
(get_last_scrub_stamp
$pg)
173 # 2) Assure the scrub is not scheduled
175 for ((i
=0; i
< 3; i
++)); do
176 if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
183 # 3) Access to the file must OK
185 objectstore_tool
$dir $osd SOMETHING list-attrs ||
return 1
186 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
187 diff $dir/ORIGINAL
$dir/COPY ||
return 1
# NOTE(review): removes SOMETHING from two OSDs ($first/$second) in parallel
# via run_in_background, waits/repairs, then verifies both copies are back
# and the object content matches ORIGINAL.  Missing interior lines (locals,
# the repair trigger, wait_background calls, terminators) — code left
# byte-identical.
190 function corrupt_and_repair_two
() {
197 # 1) remove the corresponding file from the OSDs
200 run_in_background pids objectstore_tool
$dir $first SOMETHING remove
201 run_in_background pids objectstore_tool
$dir $second SOMETHING remove
204 if [ $return_code -ne 0 ]; then return $return_code; fi
209 local pg
=$
(get_pg
$poolname SOMETHING
)
212 # 3) The files must be back
215 run_in_background pids objectstore_tool
$dir $first SOMETHING list-attrs
216 run_in_background pids objectstore_tool
$dir $second SOMETHING list-attrs
219 if [ $return_code -ne 0 ]; then return $return_code; fi
221 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
222 diff $dir/ORIGINAL
$dir/COPY ||
return 1
# NOTE(review): removes SOMETHING from one designated OSD, triggers repair
# (trigger line missing from this extract), then verifies the file has been
# restored on that OSD and content matches ORIGINAL.  Missing interior
# lines (locals, repair command, terminators) — code left byte-identical.
227 # 2) remove the corresponding file from a designated OSD
229 # 4) check that the file has been restored in the designated OSD
231 function corrupt_and_repair_one
() {
237 # 1) remove the corresponding file from the OSD
239 objectstore_tool
$dir $osd SOMETHING remove ||
return 1
243 local pg
=$
(get_pg
$poolname SOMETHING
)
246 # 3) The file must be back
248 objectstore_tool
$dir $osd SOMETHING list-attrs ||
return 1
249 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
250 diff $dir/ORIGINAL
$dir/COPY ||
return 1
# NOTE(review): EC variant — writes SOMETHING, computes primary and two
# non-primary shard OSDs (get_osds minus primary), then exercises single-
# and double-shard corruption/repair combinations; the single-OSD cases
# reproduce tracker issues 10017 and 10409.  Missing interior lines — code
# left byte-identical.
253 function corrupt_and_repair_erasure_coded
() {
257 add_something
$dir $poolname ||
return 1
259 local primary
=$
(get_primary
$poolname SOMETHING
)
260 local -a osds
=($
(get_osds
$poolname SOMETHING |
sed -e "s/$primary//"))
261 local not_primary_first
=${osds[0]}
262 local not_primary_second
=${osds[1]}
264 # Reproduces http://tracker.ceph.com/issues/10017
265 corrupt_and_repair_one
$dir $poolname $primary ||
return 1
266 # Reproduces http://tracker.ceph.com/issues/10409
267 corrupt_and_repair_one
$dir $poolname $not_primary_first ||
return 1
268 corrupt_and_repair_two
$dir $poolname $not_primary_first $not_primary_second ||
return 1
269 corrupt_and_repair_two
$dir $poolname $primary $not_primary_first ||
return 1
# NOTE(review): auto-repair scenario on an EC pool (k=2 m=1).  Brings up a
# 3-OSD cluster with osd-scrub-auto-repair=true and 5-second scrub
# intervals, writes SOMETHING, physically removes one non-primary shard,
# waits for the scheduled scrub to auto-repair it, then verifies the shard
# and the object content are back.  $2 selects bluestore (overwrites) vs
# filestore OSDs.  Missing interior lines (locals, else/fi/done/},
# payload) — code left byte-identical.
273 function auto_repair_erasure_coded
() {
275 local allow_overwrites
=$2
276 local poolname
=ecpool
278 # Launch a cluster with 5 seconds scrub interval
279 setup
$dir ||
return 1
280 run_mon
$dir a ||
return 1
281 run_mgr
$dir x ||
return 1
282 local ceph_osd_args
="--osd-scrub-auto-repair=true \
283 --osd-deep-scrub-interval=5 \
284 --osd-scrub-max-interval=5 \
285 --osd-scrub-min-interval=5 \
286 --osd-scrub-interval-randomize-ratio=0"
287 for id
in $
(seq 0 2) ; do
288 if [ "$allow_overwrites" = "true" ]; then
289 run_osd
$dir $id $ceph_osd_args ||
return 1
291 run_osd_filestore
$dir $id $ceph_osd_args ||
return 1
294 create_rbd_pool ||
return 1
295 wait_for_clean ||
return 1
298 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 ||
return 1
302 echo $payload > $dir/ORIGINAL
303 rados
--pool $poolname put SOMETHING
$dir/ORIGINAL ||
return 1
305 # Remove the object from one shard physically
306 # Restarted osd get $ceph_osd_args passed
307 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING remove ||
return 1
308 # Wait for auto repair
309 local pgid
=$
(get_pg
$poolname SOMETHING
)
310 wait_for_scrub
$pgid "$(get_last_scrub_stamp $pgid)"
311 wait_for_clean ||
return 1
312 # Verify - the file should be back
313 # Restarted osd get $ceph_osd_args passed
314 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING list-attrs ||
return 1
315 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
316 diff $dir/ORIGINAL
$dir/COPY ||
return 1
319 teardown
$dir ||
return 1
# Run the EC auto-repair scenario on an append-only EC pool (overwrites
# disabled).
# $1 - test directory passed through to auto_repair_erasure_coded.
function TEST_auto_repair_erasure_coded_appends() {
    auto_repair_erasure_coded "$1" false
}
# Run the EC auto-repair scenario with overwrites enabled, but only when the
# platform supports EC overwrite testing (file-scope $use_ec_overwrite;
# forced to false on FreeBSD).
# $1 - test directory passed through to auto_repair_erasure_coded.
function TEST_auto_repair_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        auto_repair_erasure_coded "$1" true
    fi
}
# NOTE(review): replicated (size 2) auto-repair on bluestore: removes one
# non-primary copy of SOMETHING, issues deep_scrub+scrub via "ceph tell",
# waits for auto repair, then verifies attrs and bytes are restored and the
# primary logged scrub_finish.  Missing interior lines (dir local, payload,
# done/}, sleeps) — code left byte-identical.
332 function TEST_auto_repair_bluestore_basic
() {
334 local poolname
=testpool
336 # Launch a cluster with 5 seconds scrub interval
337 setup
$dir ||
return 1
338 run_mon
$dir a ||
return 1
339 run_mgr
$dir x ||
return 1
340 local ceph_osd_args
="--osd-scrub-auto-repair=true \
341 --osd_deep_scrub_randomize_ratio=0 \
342 --osd-scrub-interval-randomize-ratio=0"
343 for id
in $
(seq 0 2) ; do
344 run_osd
$dir $id $ceph_osd_args ||
return 1
347 create_pool
$poolname 1 1 ||
return 1
348 ceph osd pool
set $poolname size
2
349 wait_for_clean ||
return 1
353 echo $payload > $dir/ORIGINAL
354 rados
--pool $poolname put SOMETHING
$dir/ORIGINAL ||
return 1
356 # Remove the object from one shard physically
357 # Restarted osd get $ceph_osd_args passed
358 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING remove ||
return 1
360 local pgid
=$
(get_pg
$poolname SOMETHING
)
361 local primary
=$
(get_primary
$poolname SOMETHING
)
362 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
363 ceph tell
$pgid deep_scrub
364 ceph tell
$pgid scrub
366 # Wait for auto repair
367 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
368 wait_for_clean ||
return 1
370 # Verify - the file should be back
371 # Restarted osd get $ceph_osd_args passed
372 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING list-attrs ||
return 1
373 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING get-bytes
$dir/COPY ||
return 1
374 diff $dir/ORIGINAL
$dir/COPY ||
return 1
375 grep scrub_finish
$dir/osd.
${primary}.log
378 teardown
$dir ||
return 1
# NOTE(review): like the basic bluestore auto-repair test, but triggers only
# a regular scrub (which escalates to auto repair) and additionally asserts
# the PG stat counter num_objects_repaired == 1 via "ceph pg query | jq".
# Missing interior lines — code left byte-identical.
381 function TEST_auto_repair_bluestore_scrub
() {
383 local poolname
=testpool
385 # Launch a cluster with 5 seconds scrub interval
386 setup
$dir ||
return 1
387 run_mon
$dir a ||
return 1
388 run_mgr
$dir x ||
return 1
389 local ceph_osd_args
="--osd-scrub-auto-repair=true \
390 --osd_deep_scrub_randomize_ratio=0 \
391 --osd-scrub-interval-randomize-ratio=0"
392 for id
in $
(seq 0 2) ; do
393 run_osd
$dir $id $ceph_osd_args ||
return 1
396 create_pool
$poolname 1 1 ||
return 1
397 ceph osd pool
set $poolname size
2
398 wait_for_clean ||
return 1
402 echo $payload > $dir/ORIGINAL
403 rados
--pool $poolname put SOMETHING
$dir/ORIGINAL ||
return 1
405 # Remove the object from one shard physically
406 # Restarted osd get $ceph_osd_args passed
407 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING remove ||
return 1
409 local pgid
=$
(get_pg
$poolname SOMETHING
)
410 local primary
=$
(get_primary
$poolname SOMETHING
)
411 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
412 ceph tell
$pgid scrub
414 # Wait for scrub -> auto repair
415 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
417 # Actually this causes 2 scrubs, so we better wait a little longer
419 wait_for_clean ||
return 1
421 # Verify - the file should be back
422 # Restarted osd get $ceph_osd_args passed
423 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING list-attrs ||
return 1
424 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
425 diff $dir/ORIGINAL
$dir/COPY ||
return 1
426 grep scrub_finish
$dir/osd.
${primary}.log
428 # This should have caused 1 object to be repaired
429 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
430 test "$COUNT" = "1" ||
return 1
433 teardown
$dir ||
return 1
# NOTE(review): auto-repair with a partially unrepairable object: obj1 loses
# one replica (repairable); obj2 loses a replica AND its '_' object-info
# attr on the primary (unrepairable), so the PG should end in
# +failed_repair; obj1 must still come back intact.  After removing obj2
# from the primary it expects the PG to return to active+clean.
# NOTE(review): "$(pgid)" in the two "ceph pg dump pgs | grep" checks is a
# command substitution invoking a command named "pgid"; this looks like it
# should be "${pgid}" — confirm against upstream.
# Missing interior lines — code left byte-identical.
436 function TEST_auto_repair_bluestore_failed
() {
438 local poolname
=testpool
440 # Launch a cluster with 5 seconds scrub interval
441 setup
$dir ||
return 1
442 run_mon
$dir a ||
return 1
443 run_mgr
$dir x ||
return 1
444 local ceph_osd_args
="--osd-scrub-auto-repair=true \
445 --osd_deep_scrub_randomize_ratio=0 \
446 --osd-scrub-interval-randomize-ratio=0"
447 for id
in $
(seq 0 2) ; do
448 run_osd
$dir $id $ceph_osd_args ||
return 1
451 create_pool
$poolname 1 1 ||
return 1
452 ceph osd pool
set $poolname size
2
453 wait_for_clean ||
return 1
457 echo $payload > $dir/ORIGINAL
460 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
463 # Remove the object from one shard physically
464 # Restarted osd get $ceph_osd_args passed
465 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj1 remove ||
return 1
466 # obj2 can't be repaired
467 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj2 remove ||
return 1
468 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 rm-attr _ ||
return 1
470 local pgid
=$
(get_pg
$poolname obj1
)
471 local primary
=$
(get_primary
$poolname obj1
)
472 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
473 ceph tell
$pgid deep_scrub
474 ceph tell
$pgid scrub
476 # Wait for auto repair
477 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
478 wait_for_clean ||
return 1
480 grep scrub_finish
$dir/osd.
${primary}.log
481 grep -q "scrub_finish.*still present after re-scrub" $dir/osd.
${primary}.log ||
return 1
483 ceph pg dump pgs |
grep -q "^$(pgid).*+failed_repair" ||
return 1
485 # Verify - obj1 should be back
486 # Restarted osd get $ceph_osd_args passed
487 objectstore_tool
$dir $
(get_not_primary
$poolname obj1
) obj1 list-attrs ||
return 1
488 rados
--pool $poolname get obj1
$dir/COPY ||
return 1
489 diff $dir/ORIGINAL
$dir/COPY ||
return 1
490 grep scrub_finish
$dir/osd.
${primary}.log
493 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 remove ||
return 1
498 ceph pg dump pgs |
grep -q "^$(pgid).* active+clean " ||
return 1
499 grep scrub_finish
$dir/osd.
${primary}.log
502 teardown
$dir ||
return 1
# NOTE(review): variant where BOTH obj1 and obj2 are made unrepairable
# (replica removed on a non-primary AND '_' attr removed on the primary),
# so after the scrub the log must contain "present with no repair possible"
# and the PG must show +failed_repair.
# NOTE(review): as in the previous test, "$(pgid)" in the grep pattern is a
# command substitution; likely intended as "${pgid}" — confirm upstream.
# Missing interior lines — code left byte-identical.
505 function TEST_auto_repair_bluestore_failed_norecov
() {
507 local poolname
=testpool
509 # Launch a cluster with 5 seconds scrub interval
510 setup
$dir ||
return 1
511 run_mon
$dir a ||
return 1
512 run_mgr
$dir x ||
return 1
513 local ceph_osd_args
="--osd-scrub-auto-repair=true \
514 --osd_deep_scrub_randomize_ratio=0 \
515 --osd-scrub-interval-randomize-ratio=0"
516 for id
in $
(seq 0 2) ; do
517 run_osd
$dir $id $ceph_osd_args ||
return 1
520 create_pool
$poolname 1 1 ||
return 1
521 ceph osd pool
set $poolname size
2
522 wait_for_clean ||
return 1
526 echo $payload > $dir/ORIGINAL
529 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
532 # Remove the object from one shard physically
533 # Restarted osd get $ceph_osd_args passed
534 # obj1 can't be repaired
535 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj1 remove ||
return 1
536 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj1 rm-attr _ ||
return 1
537 # obj2 can't be repaired
538 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj2 remove ||
return 1
539 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 rm-attr _ ||
return 1
541 local pgid
=$
(get_pg
$poolname obj1
)
542 local primary
=$
(get_primary
$poolname obj1
)
543 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
544 ceph tell
$pgid deep_scrub
545 ceph tell
$pgid scrub
547 # Wait for auto repair
548 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
549 wait_for_clean ||
return 1
551 grep -q "scrub_finish.*present with no repair possible" $dir/osd.
${primary}.log ||
return 1
553 ceph pg dump pgs |
grep -q "^$(pgid).*+failed_repair" ||
return 1
556 teardown
$dir ||
return 1
# NOTE(review): replicated repair-statistics test: writes $OBJS objects,
# stops the primary and one other OSD, removes $REPAIRS objects from both
# stopped OSDs offline, restarts them, waits for recovery/repair, then
# asserts num_objects_repaired == $REPAIRS on the PG and
# num_shards_repaired == REPAIRS/2 per OSD and == REPAIRS in osd_stats_sum.
# NOTE(review): "$OSD" in the _objectstore_tool_nodown call is not declared
# in the visible lines (only OSDS/other/primary are) — presumably set by a
# missing inner loop over the two stopped OSDs; confirm upstream.
# Missing interior lines (OSDS/OBJS/REPAIRS definitions, loop bodies,
# terminators) — code left byte-identical.
559 function TEST_repair_stats
() {
561 local poolname
=testpool
564 # This need to be an even number
567 # Launch a cluster with 5 seconds scrub interval
568 setup
$dir ||
return 1
569 run_mon
$dir a ||
return 1
570 run_mgr
$dir x ||
return 1
571 local ceph_osd_args
="--osd_deep_scrub_randomize_ratio=0 \
572 --osd-scrub-interval-randomize-ratio=0"
573 for id
in $
(seq 0 $
(expr $OSDS - 1)) ; do
574 run_osd
$dir $id $ceph_osd_args ||
return 1
577 create_pool
$poolname 1 1 ||
return 1
578 ceph osd pool
set $poolname size
2
579 wait_for_clean ||
return 1
583 echo $payload > $dir/ORIGINAL
584 for i
in $
(seq 1 $OBJS)
586 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
589 # Remove the object from one shard physically
590 # Restarted osd get $ceph_osd_args passed
591 local other
=$
(get_not_primary
$poolname obj1
)
592 local pgid
=$
(get_pg
$poolname obj1
)
593 local primary
=$
(get_primary
$poolname obj1
)
595 kill_daemons
$dir TERM osd.
$other >&2 < /dev
/null ||
return 1
596 kill_daemons
$dir TERM osd.
$primary >&2 < /dev
/null ||
return 1
597 for i
in $
(seq 1 $REPAIRS)
599 # Remove from both osd.0 and osd.1
601 _objectstore_tool_nodown
$dir $OSD obj
$i remove ||
return 1
603 activate_osd
$dir $primary $ceph_osd_args ||
return 1
604 activate_osd
$dir $other $ceph_osd_args ||
return 1
605 wait_for_clean ||
return 1
608 wait_for_clean ||
return 1
612 # This should have caused 1 object to be repaired
613 ceph pg
$pgid query | jq
'.info.stats.stat_sum'
614 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
615 test "$COUNT" = "$REPAIRS" ||
return 1
617 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $primary )"
618 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $primary ).num_shards_repaired")
619 test "$COUNT" = "$(expr $REPAIRS / 2)" ||
return 1
621 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $other )"
622 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $other ).num_shards_repaired")
623 test "$COUNT" = "$(expr $REPAIRS / 2)" ||
return 1
625 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum"
626 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum.num_shards_repaired")
627 test "$COUNT" = "$REPAIRS" ||
return 1
630 teardown
$dir ||
return 1
# NOTE(review): EC (k=2 m=1, no overwrites) version of the repair-stats
# test; same stop-two-OSDs / offline-remove / restart / recover flow, then
# per-OSD num_shards_repaired is checked: REPAIRS/2 for the two OSDs that
# had shards removed ($other/$primary), per the visible if-branch; the
# else-branch (presumably 0) is missing from this extract.
# NOTE(review): as in TEST_repair_stats, "$OSD" is used but not declared in
# the visible lines — likely from a missing inner loop; confirm upstream.
# Missing interior lines — code left byte-identical.
633 function TEST_repair_stats_ec
() {
635 local poolname
=testpool
638 # This need to be an even number
640 local allow_overwrites
=false
642 # Launch a cluster with 5 seconds scrub interval
643 setup
$dir ||
return 1
644 run_mon
$dir a ||
return 1
645 run_mgr
$dir x ||
return 1
646 local ceph_osd_args
="--osd_deep_scrub_randomize_ratio=0 \
647 --osd-scrub-interval-randomize-ratio=0"
648 for id
in $
(seq 0 $
(expr $OSDS - 1)) ; do
649 run_osd
$dir $id $ceph_osd_args ||
return 1
653 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 ||
return 1
657 echo $payload > $dir/ORIGINAL
658 for i
in $
(seq 1 $OBJS)
660 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
663 # Remove the object from one shard physically
664 # Restarted osd get $ceph_osd_args passed
665 local other
=$
(get_not_primary
$poolname obj1
)
666 local pgid
=$
(get_pg
$poolname obj1
)
667 local primary
=$
(get_primary
$poolname obj1
)
669 kill_daemons
$dir TERM osd.
$other >&2 < /dev
/null ||
return 1
670 kill_daemons
$dir TERM osd.
$primary >&2 < /dev
/null ||
return 1
671 for i
in $
(seq 1 $REPAIRS)
673 # Remove from both osd.0 and osd.1
675 _objectstore_tool_nodown
$dir $OSD obj
$i remove ||
return 1
677 activate_osd
$dir $primary $ceph_osd_args ||
return 1
678 activate_osd
$dir $other $ceph_osd_args ||
return 1
679 wait_for_clean ||
return 1
682 wait_for_clean ||
return 1
686 # This should have caused 1 object to be repaired
687 ceph pg
$pgid query | jq
'.info.stats.stat_sum'
688 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
689 test "$COUNT" = "$REPAIRS" ||
return 1
691 for osd
in $
(seq 0 $
(expr $OSDS - 1)) ; do
692 if [ $osd = $other -o $osd = $primary ]; then
693 repair
=$
(expr $REPAIRS / 2)
698 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $osd )"
699 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $osd ).num_shards_repaired")
700 test "$COUNT" = "$repair" ||
return 1
703 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum"
704 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum.num_shards_repaired")
705 test "$COUNT" = "$REPAIRS" ||
return 1
708 teardown
$dir ||
return 1
# NOTE(review): corrupt/repair on a jerasure EC pool (k=2 m=2) over 4 OSDs;
# $2 selects bluestore (overwrites) vs filestore OSDs, then delegates to
# corrupt_and_repair_erasure_coded.  Missing interior lines (dir local,
# else/fi/done/}) — code left byte-identical.
711 function corrupt_and_repair_jerasure
() {
713 local allow_overwrites
=$2
714 local poolname
=ecpool
716 setup
$dir ||
return 1
717 run_mon
$dir a ||
return 1
718 run_mgr
$dir x ||
return 1
719 for id
in $
(seq 0 3) ; do
720 if [ "$allow_overwrites" = "true" ]; then
721 run_osd
$dir $id ||
return 1
723 run_osd_filestore
$dir $id ||
return 1
726 create_rbd_pool ||
return 1
727 wait_for_clean ||
return 1
729 create_ec_pool
$poolname $allow_overwrites k
=2 m
=2 ||
return 1
730 corrupt_and_repair_erasure_coded
$dir $poolname ||
return 1
732 teardown
$dir ||
return 1
# Jerasure corrupt-and-repair scenario with EC overwrites disabled.
# $1 - test directory passed through to corrupt_and_repair_jerasure.
function TEST_corrupt_and_repair_jerasure_appends() {
    corrupt_and_repair_jerasure "$1" false
}
# Jerasure corrupt-and-repair scenario with EC overwrites, gated on
# platform support (file-scope $use_ec_overwrite).
# $1 - test directory passed through to corrupt_and_repair_jerasure.
function TEST_corrupt_and_repair_jerasure_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_and_repair_jerasure "$1" true
    fi
}
# NOTE(review): corrupt/repair on an LRC EC pool (k=4 m=2 l=3, plugin=lrc)
# over 10 OSDs; $2 selects bluestore vs filestore OSDs, then delegates to
# corrupt_and_repair_erasure_coded.  Missing interior lines — code left
# byte-identical.
745 function corrupt_and_repair_lrc
() {
747 local allow_overwrites
=$2
748 local poolname
=ecpool
750 setup
$dir ||
return 1
751 run_mon
$dir a ||
return 1
752 run_mgr
$dir x ||
return 1
753 for id
in $
(seq 0 9) ; do
754 if [ "$allow_overwrites" = "true" ]; then
755 run_osd
$dir $id ||
return 1
757 run_osd_filestore
$dir $id ||
return 1
760 create_rbd_pool ||
return 1
761 wait_for_clean ||
return 1
763 create_ec_pool
$poolname $allow_overwrites k
=4 m
=2 l
=3 plugin
=lrc ||
return 1
764 corrupt_and_repair_erasure_coded
$dir $poolname ||
return 1
766 teardown
$dir ||
return 1
# LRC corrupt-and-repair scenario with EC overwrites disabled.
# $1 - test directory passed through to corrupt_and_repair_lrc.
function TEST_corrupt_and_repair_lrc_appends() {
    corrupt_and_repair_lrc "$1" false
}
# LRC corrupt-and-repair scenario with EC overwrites, gated on platform
# support (file-scope $use_ec_overwrite).
# $1 - test directory passed through to corrupt_and_repair_lrc.
function TEST_corrupt_and_repair_lrc_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_and_repair_lrc "$1" true
    fi
}
# NOTE(review): EC (k=2 m=2) unfound-object scenario: removes SOMETHING's
# shard from all three non-primary OSDs in parallel, then polls "ceph -s"
# for up to 60 iterations until "1/1 objects unfound" appears, and finally
# asserts all 4 OSDs are up/in and the object is reported unfound.  Missing
# interior lines (dir local, wait_background, repair trigger, sleeps,
# done/fi/}) — code left byte-identical.
779 function unfound_erasure_coded
() {
781 local allow_overwrites
=$2
782 local poolname
=ecpool
785 setup
$dir ||
return 1
786 run_mon
$dir a ||
return 1
787 run_mgr
$dir x ||
return 1
788 for id
in $
(seq 0 3) ; do
789 if [ "$allow_overwrites" = "true" ]; then
790 run_osd
$dir $id ||
return 1
792 run_osd_filestore
$dir $id ||
return 1
796 create_ec_pool
$poolname $allow_overwrites k
=2 m
=2 ||
return 1
798 add_something
$dir $poolname ||
return 1
800 local primary
=$
(get_primary
$poolname SOMETHING
)
801 local -a osds
=($
(get_osds
$poolname SOMETHING |
sed -e "s/$primary//"))
802 local not_primary_first
=${osds[0]}
803 local not_primary_second
=${osds[1]}
804 local not_primary_third
=${osds[2]}
807 # 1) remove the corresponding file from the OSDs
810 run_in_background pids objectstore_tool
$dir $not_primary_first SOMETHING remove
811 run_in_background pids objectstore_tool
$dir $not_primary_second SOMETHING remove
812 run_in_background pids objectstore_tool
$dir $not_primary_third SOMETHING remove
815 if [ $return_code -ne 0 ]; then return $return_code; fi
820 local pg
=$
(get_pg
$poolname SOMETHING
)
825 # it may take a bit to appear due to mon/mgr asynchrony
826 for f
in `seq 1 60`; do
827 ceph
-s |
grep "1/1 objects unfound" && break
830 ceph
-s|
grep "4 up" ||
return 1
831 ceph
-s|
grep "4 in" ||
return 1
832 ceph
-s|
grep "1/1 objects unfound" ||
return 1
834 teardown
$dir ||
return 1
# Unfound-object EC scenario with overwrites disabled.
# $1 - test directory passed through to unfound_erasure_coded.
function TEST_unfound_erasure_coded_appends() {
    unfound_erasure_coded "$1" false
}
# Unfound-object EC scenario with overwrites, gated on platform support
# (file-scope $use_ec_overwrite).
# $1 - test directory passed through to unfound_erasure_coded.
function TEST_unfound_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        unfound_erasure_coded "$1" true
    fi
}
# NOTE(review): list_missing on an EC pool (k=2 m=1): writes MOBJ0 and
# MOBJ1, stops all 3 OSDs, removes two shards of each offline with
# ceph-objectstore-tool (MOBJ0 including the primary shard, MOBJ1 excluding
# it, per the comments — the osd-selection lines for each removal are
# missing from this extract), restarts, repairs, and polls
# "ceph pg $pg list_unfound" until both objects are reported (fails after
# 60 iterations).  Code left byte-identical.
848 # list_missing for EC pool
850 function list_missing_erasure_coded
() {
852 local allow_overwrites
=$2
853 local poolname
=ecpool
855 setup
$dir ||
return 1
856 run_mon
$dir a ||
return 1
857 run_mgr
$dir x ||
return 1
858 for id
in $
(seq 0 2) ; do
859 if [ "$allow_overwrites" = "true" ]; then
860 run_osd
$dir $id ||
return 1
862 run_osd_filestore
$dir $id ||
return 1
865 create_rbd_pool ||
return 1
866 wait_for_clean ||
return 1
868 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 ||
return 1
870 # Put an object and remove the two shards (including primary)
871 add_something
$dir $poolname MOBJ0 ||
return 1
872 local -a osds0
=($
(get_osds
$poolname MOBJ0
))
874 # Put another object and remove two shards (excluding primary)
875 add_something
$dir $poolname MOBJ1 ||
return 1
876 local -a osds1
=($
(get_osds
$poolname MOBJ1
))
878 # Stop all osd daemons
879 for id
in $
(seq 0 2) ; do
880 kill_daemons
$dir TERM osd.
$id >&2 < /dev
/null ||
return 1
884 ceph-objectstore-tool
--data-path $dir/$id \
885 MOBJ0 remove ||
return 1
887 ceph-objectstore-tool
--data-path $dir/$id \
888 MOBJ0 remove ||
return 1
891 ceph-objectstore-tool
--data-path $dir/$id \
892 MOBJ1 remove ||
return 1
894 ceph-objectstore-tool
--data-path $dir/$id \
895 MOBJ1 remove ||
return 1
897 for id
in $
(seq 0 2) ; do
898 activate_osd
$dir $id >&2 ||
return 1
900 create_rbd_pool ||
return 1
901 wait_for_clean ||
return 1
903 # Get get - both objects should in the same PG
904 local pg
=$
(get_pg
$poolname MOBJ0
)
906 # Repair the PG, which triggers the recovering,
907 # and should mark the object as unfound
910 for i
in $
(seq 0 120) ; do
911 [ $i -lt 60 ] ||
return 1
912 matches
=$
(ceph pg
$pg list_unfound |
egrep "MOBJ0|MOBJ1" |
wc -l)
913 [ $matches -eq 2 ] && break
916 teardown
$dir ||
return 1
# list_missing EC scenario with overwrites disabled.
# $1 - test directory passed through to list_missing_erasure_coded.
function TEST_list_missing_erasure_coded_appends() {
    list_missing_erasure_coded "$1" false
}
# list_missing EC scenario with overwrites, gated on platform support
# (file-scope $use_ec_overwrite).
# $1 - test directory passed through to list_missing_erasure_coded.
function TEST_list_missing_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        list_missing_erasure_coded "$1" true
    fi
}
930 # Corrupt one copy of a replicated pool
932 function TEST_corrupt_scrub_replicated
() {
934 local poolname
=csr_pool
937 setup
$dir ||
return 1
938 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
939 run_mgr
$dir x ||
return 1
940 run_osd
$dir 0 ||
return 1
941 run_osd
$dir 1 ||
return 1
942 create_rbd_pool ||
return 1
943 wait_for_clean ||
return 1
945 create_pool foo
1 ||
return 1
946 create_pool
$poolname 1 1 ||
return 1
947 wait_for_clean ||
return 1
949 for i
in $
(seq 1 $total_objs) ; do
951 add_something
$dir $poolname $objname ||
return 1
953 rados
--pool $poolname setomapheader
$objname hdr-
$objname ||
return 1
954 rados
--pool $poolname setomapval
$objname key-
$objname val-
$objname ||
return 1
957 # Increase file 1 MB + 1KB
958 dd if=/dev
/zero of
=$dir/new.ROBJ19 bs
=1024 count
=1025
959 rados
--pool $poolname put
$objname $dir/new.ROBJ19 ||
return 1
960 rm -f $dir/new.ROBJ19
962 local pg
=$
(get_pg
$poolname ROBJ0
)
963 local primary
=$
(get_primary
$poolname ROBJ0
)
965 # Compute an old omap digest and save oi
966 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd
.0) \
967 config
set osd_deep_scrub_update_digest_min_age
0
968 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd
.1) \
969 config
set osd_deep_scrub_update_digest_min_age
0
972 for i
in $
(seq 1 $total_objs) ; do
975 # Alternate corruption between osd.0 and osd.1
976 local osd
=$
(expr $i % 2)
980 # Size (deep scrub data_digest too)
981 local payload
=UVWXYZZZ
982 echo $payload > $dir/CORRUPT
983 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
987 # digest (deep scrub only)
989 echo $payload > $dir/CORRUPT
990 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
995 objectstore_tool
$dir $osd $objname remove ||
return 1
999 # Modify omap value (deep scrub only)
1000 objectstore_tool
$dir $osd $objname set-omap key-
$objname $dir/CORRUPT ||
return 1
1004 # Delete omap key (deep scrub only)
1005 objectstore_tool
$dir $osd $objname rm-omap key-
$objname ||
return 1
1009 # Add extra omap key (deep scrub only)
1010 echo extra
> $dir/extra-val
1011 objectstore_tool
$dir $osd $objname set-omap key2-
$objname $dir/extra-val ||
return 1
1016 # Modify omap header (deep scrub only)
1017 echo -n newheader
> $dir/hdr
1018 objectstore_tool
$dir $osd $objname set-omaphdr
$dir/hdr ||
return 1
1023 rados
--pool $poolname setxattr
$objname key1-
$objname val1-
$objname ||
return 1
1024 rados
--pool $poolname setxattr
$objname key2-
$objname val2-
$objname ||
return 1
1027 echo -n bad-val
> $dir/bad-val
1028 objectstore_tool
$dir $osd $objname set-attr _key1-
$objname $dir/bad-val ||
return 1
1029 objectstore_tool
$dir $osd $objname rm-attr _key2-
$objname ||
return 1
1030 echo -n val3-
$objname > $dir/newval
1031 objectstore_tool
$dir $osd $objname set-attr _key3-
$objname $dir/newval ||
return 1
1032 rm $dir/bad-val
$dir/newval
1036 objectstore_tool
$dir $osd $objname get-attr _
> $dir/robj9-oi
1037 echo -n D
> $dir/change
1038 rados
--pool $poolname put
$objname $dir/change
1039 objectstore_tool
$dir $osd $objname set-attr _
$dir/robj9-oi
1040 rm $dir/oi
$dir/change
1043 # ROBJ10 must be handled after digests are re-computed by a deep scrub below
1044 # ROBJ11 must be handled with config change before deep scrub
1045 # ROBJ12 must be handled with config change before scrubs
1046 # ROBJ13 must be handled before scrubs
1049 echo -n bad-val
> $dir/bad-val
1050 objectstore_tool
$dir 0 $objname set-attr _
$dir/bad-val ||
return 1
1051 objectstore_tool
$dir 1 $objname rm-attr _ ||
return 1
1056 objectstore_tool
$dir $osd $objname rm-attr _ ||
return 1
1060 objectstore_tool
$dir 0 $objname rm-attr snapset ||
return 1
1061 echo -n bad-val
> $dir/bad-val
1062 objectstore_tool
$dir 1 $objname set-attr snapset
$dir/bad-val ||
return 1
1066 # Deep-scrub only (all replicas are diffent than the object info
1067 local payload
=ROBJ17
1068 echo $payload > $dir/new.ROBJ17
1069 objectstore_tool
$dir 0 $objname set-bytes
$dir/new.ROBJ17 ||
return 1
1070 objectstore_tool
$dir 1 $objname set-bytes
$dir/new.ROBJ17 ||
return 1
1074 # Deep-scrub only (all replicas are diffent than the object info
1075 local payload
=ROBJ18
1076 echo $payload > $dir/new.ROBJ18
1077 objectstore_tool
$dir 0 $objname set-bytes
$dir/new.ROBJ18 ||
return 1
1078 objectstore_tool
$dir 1 $objname set-bytes
$dir/new.ROBJ18 ||
return 1
1079 # Make one replica have a different object info, so a full repair must happen too
1080 objectstore_tool
$dir $osd $objname corrupt-info ||
return 1
1084 # Set osd-max-object-size smaller than this object's size
1089 local pg
=$
(get_pg
$poolname ROBJ0
)
1091 ceph tell osd.\
* injectargs
-- --osd-max-object-size=1048576
1093 inject_eio rep data
$poolname ROBJ11
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
1094 inject_eio rep mdata
$poolname ROBJ12
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
1095 inject_eio rep mdata
$poolname ROBJ13
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
1096 inject_eio rep data
$poolname ROBJ13
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
1101 declare -a s_err_strings
1102 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:30259878:::ROBJ15:head : candidate had a missing info key"
1103 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:33aca486:::ROBJ18:head : object info inconsistent "
1104 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:5c7b2c47:::ROBJ16:head : candidate had a corrupt snapset"
1105 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:5c7b2c47:::ROBJ16:head : candidate had a missing snapset key"
1106 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:5c7b2c47:::ROBJ16:head : failed to pick suitable object info"
1107 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:86586531:::ROBJ8:head : attr value mismatch '_key1-ROBJ8', attr name mismatch '_key3-ROBJ8', attr name mismatch '_key2-ROBJ8'"
1108 err_strings
[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:bc819597:::ROBJ12:head : candidate had a stat error"
1109 err_strings
[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:c0c86b1d:::ROBJ14:head : candidate had a missing info key"
1110 err_strings
[8]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:c0c86b1d:::ROBJ14:head : candidate had a corrupt info"
1111 err_strings
[9]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:c0c86b1d:::ROBJ14:head : failed to pick suitable object info"
1112 err_strings
[10]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : candidate size 9 info size 7 mismatch"
1113 err_strings
[11]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : size 9 != size 7 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from shard 0"
1114 err_strings
[12]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:d60617f9:::ROBJ13:head : candidate had a stat error"
1115 err_strings
[13]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 3:f2a5b2a4:::ROBJ3:head : missing"
1116 err_strings
[14]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ffdb2004:::ROBJ9:head : candidate size 1 info size 7 mismatch"
1117 err_strings
[15]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ffdb2004:::ROBJ9:head : object info inconsistent "
1118 err_strings
[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 3:c0c86b1d:::ROBJ14:head : no '_' attr"
1119 err_strings
[17]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 3:5c7b2c47:::ROBJ16:head : can't decode 'snapset' attr buffer::malformed_input: .* no longer understand old encoding version 3 < 97"
1120 err_strings
[18]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub : stat mismatch, got 19/19 objects, 0/0 clones, 18/19 dirty, 18/19 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 1049713/1049720 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
1121 err_strings
[19]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 1 missing, 8 inconsistent objects"
1122 err_strings
[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 18 errors"
1123 err_strings
[21]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:123a5f55:::ROBJ19:head : size 1049600 > 1048576 is too large"
1125 for err_string
in "${err_strings[@]}"
1127 if ! grep -q "$err_string" $dir/osd.
${primary}.log
1129 echo "Missing log message '$err_string'"
1130 ERRORS
=$
(expr $ERRORS + 1)
1134 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
1136 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
1138 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
1140 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
1141 # Get epoch for repair-get requests
1142 epoch
=$
(jq .epoch
$dir/json
)
1144 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
1167 "prior_version": "21'3",
1168 "last_reqid": "osd.1.0:57",
1182 "data_digest": "0x2ddbf8f5",
1183 "omap_digest": "0xf5fba2c6",
1184 "expected_object_size": 0,
1185 "expected_write_size": 0,
1186 "alloc_hint_flags": 0,
1194 "size_mismatch_info",
1195 "obj_size_info_mismatch"
1201 "selected_object_info": {
1212 "prior_version": "21'3",
1213 "last_reqid": "osd.1.0:57",
1216 "mtime": "2018-04-05 14:33:19.804040",
1217 "local_mtime": "2018-04-05 14:33:19.804839",
1227 "data_digest": "0x2ddbf8f5",
1228 "omap_digest": "0xf5fba2c6",
1229 "expected_object_size": 0,
1230 "expected_write_size": 0,
1231 "alloc_hint_flags": 0,
1237 "union_shard_errors": [
1238 "size_mismatch_info",
1239 "obj_size_info_mismatch"
1268 "selected_object_info": {
1279 "prior_version": "43'36",
1280 "last_reqid": "osd.1.0:55",
1294 "data_digest": "0x2ddbf8f5",
1295 "omap_digest": "0x067f306a",
1296 "expected_object_size": 0,
1297 "expected_write_size": 0,
1298 "alloc_hint_flags": 0,
1304 "union_shard_errors": [
1332 "selected_object_info": {
1343 "prior_version": "45'39",
1344 "last_reqid": "osd.1.0:58",
1358 "data_digest": "0x2ddbf8f5",
1359 "omap_digest": "0x6441854d",
1360 "expected_object_size": 0,
1361 "expected_write_size": 0,
1362 "alloc_hint_flags": 0,
1368 "union_shard_errors": [
1383 "object_info": "bad-val",
1400 "union_shard_errors": [
1427 "prior_version": "49'45",
1428 "last_reqid": "osd.1.0:48",
1431 "mtime": "2018-04-05 14:33:29.498969",
1432 "local_mtime": "2018-04-05 14:33:29.499890",
1442 "data_digest": "0x2ddbf8f5",
1443 "omap_digest": "0x2d2a4d6e",
1444 "expected_object_size": 0,
1445 "expected_write_size": 0,
1446 "alloc_hint_flags": 0,
1466 "selected_object_info": {
1477 "prior_version": "49'45",
1478 "last_reqid": "osd.1.0:48",
1492 "data_digest": "0x2ddbf8f5",
1493 "omap_digest": "0x2d2a4d6e",
1494 "expected_object_size": 0,
1495 "expected_write_size": 0,
1496 "alloc_hint_flags": 0,
1502 "union_shard_errors": [
1538 "snapset": "bad-val",
1542 "union_shard_errors": [
1549 "object_info_inconsistency"
1557 "selected_object_info": {
1558 "alloc_hint_flags": 255,
1559 "data_digest": "0x2ddbf8f5",
1560 "expected_object_size": 0,
1561 "expected_write_size": 0,
1581 "omap_digest": "0xddc3680f",
1592 "alloc_hint_flags": 0,
1593 "data_digest": "0x2ddbf8f5",
1594 "expected_object_size": 0,
1595 "expected_write_size": 0,
1615 "omap_digest": "0xddc3680f",
1629 "alloc_hint_flags": 255,
1630 "data_digest": "0x2ddbf8f5",
1631 "expected_object_size": 0,
1632 "expected_write_size": 0,
1652 "omap_digest": "0xddc3680f",
1664 "union_shard_errors": []
1677 "union_shard_errors": [],
1678 "selected_object_info": {
1689 "prior_version": "63'58",
1690 "last_reqid": "osd.1.0:58",
1693 "mtime": "2019-08-09T23:33:58.340709+0000",
1694 "local_mtime": "2019-08-09T23:33:58.345676+0000",
1704 "data_digest": "0x3dde0ef3",
1705 "omap_digest": "0xbffddd28",
1706 "expected_object_size": 0,
1707 "expected_write_size": 0,
1708 "alloc_hint_flags": 0,
1745 "selected_object_info": {
1756 "prior_version": "25'9",
1757 "last_reqid": "osd.1.0:60",
1771 "data_digest": "0x2ddbf8f5",
1772 "omap_digest": "0x00b35dfd",
1773 "expected_object_size": 0,
1774 "expected_write_size": 0,
1775 "alloc_hint_flags": 0,
1781 "union_shard_errors": [
1800 "name": "key1-ROBJ8"
1804 "value": "val2-ROBJ8",
1805 "name": "key2-ROBJ8"
1817 "value": "val1-ROBJ8",
1818 "name": "key1-ROBJ8"
1822 "value": "val3-ROBJ8",
1823 "name": "key3-ROBJ8"
1832 "selected_object_info": {
1843 "prior_version": "79'65",
1844 "last_reqid": "client.4554.0:1",
1858 "data_digest": "0x2ddbf8f5",
1859 "omap_digest": "0xd6be81dc",
1860 "expected_object_size": 0,
1861 "expected_write_size": 0,
1862 "alloc_hint_flags": 0,
1868 "union_shard_errors": [],
1870 "attr_value_mismatch",
1871 "attr_name_mismatch"
1895 "prior_version": "51'64",
1896 "last_reqid": "client.4649.0:1",
1910 "data_digest": "0x2b63260d",
1911 "omap_digest": "0x2eecc539",
1912 "expected_object_size": 0,
1913 "expected_write_size": 0,
1914 "alloc_hint_flags": 0,
1937 "prior_version": "37'27",
1938 "last_reqid": "osd.1.0:63",
1941 "mtime": "2018-04-05 14:33:25.352485",
1942 "local_mtime": "2018-04-05 14:33:25.353746",
1952 "data_digest": "0x2ddbf8f5",
1953 "omap_digest": "0x2eecc539",
1954 "expected_object_size": 0,
1955 "expected_write_size": 0,
1956 "alloc_hint_flags": 0,
1964 "obj_size_info_mismatch"
1970 "selected_object_info": {
1981 "prior_version": "51'64",
1982 "last_reqid": "client.4649.0:1",
1996 "data_digest": "0x2b63260d",
1997 "omap_digest": "0x2eecc539",
1998 "expected_object_size": 0,
1999 "expected_write_size": 0,
2000 "alloc_hint_flags": 0,
2006 "union_shard_errors": [
2007 "obj_size_info_mismatch"
2010 "object_info_inconsistency"
2025 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python3
-c "$sortkeys" > $dir/csjson
2026 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
2027 if test $getjson = "yes"
2029 jq
'.' $dir/json
> save1.json
2032 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
2034 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
2038 # Change data and size again because digest was recomputed
2039 echo -n ZZZ
> $dir/change
2040 rados
--pool $poolname put
$objname $dir/change
2041 # Set one to an even older value
2042 objectstore_tool
$dir 0 $objname set-attr _
$dir/robj9-oi
2043 rm $dir/oi
$dir/change
2046 objectstore_tool
$dir 1 $objname get-attr _
> $dir/oi
2047 rados
--pool $poolname setomapval
$objname key2-
$objname val2-
$objname
2048 objectstore_tool
$dir 0 $objname set-attr _
$dir/oi
2049 objectstore_tool
$dir 1 $objname set-attr _
$dir/oi
2052 inject_eio rep data
$poolname ROBJ11
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
2053 inject_eio rep mdata
$poolname ROBJ12
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
2054 inject_eio rep mdata
$poolname ROBJ13
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
2055 inject_eio rep data
$poolname ROBJ13
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
2057 # ROBJ19 won't error this time
2058 ceph tell osd.\
* injectargs
-- --osd-max-object-size=134217728
2063 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:30259878:::ROBJ15:head : candidate had a missing info key"
2064 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:33aca486:::ROBJ18:head : data_digest 0xbd89c912 != data_digest 0x2ddbf8f5 from auth oi 3:33aca486:::ROBJ18:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 54 dd 2ddbf8f5 od ddc3680f alloc_hint [[]0 0 255[]][)], object info inconsistent "
2065 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:33aca486:::ROBJ18:head : data_digest 0xbd89c912 != data_digest 0x2ddbf8f5 from auth oi 3:33aca486:::ROBJ18:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 54 dd 2ddbf8f5 od ddc3680f alloc_hint [[]0 0 255[]][)]"
2066 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:33aca486:::ROBJ18:head : failed to pick suitable auth object"
2067 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:5c7b2c47:::ROBJ16:head : candidate had a corrupt snapset"
2068 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:5c7b2c47:::ROBJ16:head : candidate had a missing snapset key"
2069 err_strings
[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:5c7b2c47:::ROBJ16:head : failed to pick suitable object info"
2070 err_strings
[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:86586531:::ROBJ8:head : attr value mismatch '_key1-ROBJ8', attr name mismatch '_key3-ROBJ8', attr name mismatch '_key2-ROBJ8'"
2071 err_strings
[8]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:87abbf36:::ROBJ11:head : candidate had a read error"
2072 err_strings
[9]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:8aa5320e:::ROBJ17:head : data_digest 0x5af0c3ef != data_digest 0x2ddbf8f5 from auth oi 3:8aa5320e:::ROBJ17:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 51 dd 2ddbf8f5 od e9572720 alloc_hint [[]0 0 0[]][)]"
2073 err_strings
[10]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:8aa5320e:::ROBJ17:head : data_digest 0x5af0c3ef != data_digest 0x2ddbf8f5 from auth oi 3:8aa5320e:::ROBJ17:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 51 dd 2ddbf8f5 od e9572720 alloc_hint [[]0 0 0[]][)]"
2074 err_strings
[11]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:8aa5320e:::ROBJ17:head : failed to pick suitable auth object"
2075 err_strings
[12]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:8b55fa4b:::ROBJ7:head : omap_digest 0xefced57a != omap_digest 0x6a73cc07 from shard 1"
2076 err_strings
[13]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:8b55fa4b:::ROBJ7:head : omap_digest 0x6a73cc07 != omap_digest 0xefced57a from auth oi 3:8b55fa4b:::ROBJ7:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [[]0 0 0[]][)]"
2077 err_strings
[14]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:a53c12e8:::ROBJ6:head : omap_digest 0x689ee887 != omap_digest 0x179c919f from shard 1, omap_digest 0x689ee887 != omap_digest 0x179c919f from auth oi 3:a53c12e8:::ROBJ6:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [[]0 0 0[]][)]"
2078 err_strings
[15]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:b1f19cbd:::ROBJ10:head : omap_digest 0xa8dd5adc != omap_digest 0xc2025a24 from auth oi 3:b1f19cbd:::ROBJ10:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [[]0 0 0[]][)]"
2079 err_strings
[16]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:b1f19cbd:::ROBJ10:head : omap_digest 0xa8dd5adc != omap_digest 0xc2025a24 from auth oi 3:b1f19cbd:::ROBJ10:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [[]0 0 0[]][)]"
2080 err_strings
[17]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:b1f19cbd:::ROBJ10:head : failed to pick suitable auth object"
2081 err_strings
[18]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:bc819597:::ROBJ12:head : candidate had a stat error"
2082 err_strings
[19]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:c0c86b1d:::ROBJ14:head : candidate had a missing info key"
2083 err_strings
[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:c0c86b1d:::ROBJ14:head : candidate had a corrupt info"
2084 err_strings
[21]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:c0c86b1d:::ROBJ14:head : failed to pick suitable object info"
2085 err_strings
[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : candidate size 9 info size 7 mismatch"
2086 err_strings
[23]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : data_digest 0x2d4a11c2 != data_digest 0x2ddbf8f5 from shard 0, data_digest 0x2d4a11c2 != data_digest 0x2ddbf8f5 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from shard 0"
2087 err_strings
[24]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:d60617f9:::ROBJ13:head : candidate had a read error"
2088 err_strings
[25]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:d60617f9:::ROBJ13:head : candidate had a stat error"
2089 err_strings
[26]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:d60617f9:::ROBJ13:head : failed to pick suitable object info"
2090 err_strings
[27]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:e97ce31e:::ROBJ2:head : data_digest 0x578a4830 != data_digest 0x2ddbf8f5 from shard 1, data_digest 0x578a4830 != data_digest 0x2ddbf8f5 from auth oi 3:e97ce31e:::ROBJ2:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [[]0 0 0[]][)]"
2091 err_strings
[28]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 3:f2a5b2a4:::ROBJ3:head : missing"
2092 err_strings
[29]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:f4981d31:::ROBJ4:head : omap_digest 0xd7178dfe != omap_digest 0xe2d46ea4 from shard 1, omap_digest 0xd7178dfe != omap_digest 0xe2d46ea4 from auth oi 3:f4981d31:::ROBJ4:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [[]0 0 0[]][)]"
2093 err_strings
[30]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:f4bfd4d1:::ROBJ5:head : omap_digest 0x1a862a41 != omap_digest 0x6cac8f6 from shard 1"
2094 err_strings
[31]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:f4bfd4d1:::ROBJ5:head : omap_digest 0x6cac8f6 != omap_digest 0x1a862a41 from auth oi 3:f4bfd4d1:::ROBJ5:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [[]0 0 0[]][)]"
2095 err_strings
[32]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:ffdb2004:::ROBJ9:head : candidate size 3 info size 7 mismatch"
2096 err_strings
[33]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:ffdb2004:::ROBJ9:head : object info inconsistent "
2097 err_strings
[34]="log_channel[(]cluster[)] log [[]ERR[]] : deep-scrub [0-9]*[.]0 3:c0c86b1d:::ROBJ14:head : no '_' attr"
2098 err_strings
[35]="log_channel[(]cluster[)] log [[]ERR[]] : deep-scrub [0-9]*[.]0 3:5c7b2c47:::ROBJ16:head : can't decode 'snapset' attr buffer::malformed_input: .* no longer understand old encoding version 3 < 97"
2099 err_strings
[36]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub : stat mismatch, got 19/19 objects, 0/0 clones, 18/19 dirty, 18/19 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 1049715/1049716 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
2100 err_strings
[37]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub 1 missing, 11 inconsistent objects"
2101 err_strings
[38]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub 35 errors"
2103 for err_string
in "${err_strings[@]}"
2105 if ! grep -q "$err_string" $dir/osd.
${primary}.log
2107 echo "Missing log message '$err_string'"
2108 ERRORS
=$
(expr $ERRORS + 1)
2112 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
2114 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
2116 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
2118 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
2119 # Get epoch for repair-get requests
2120 epoch
=$
(jq .epoch
$dir/json
)
2122 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
2128 "data_digest": "0x2ddbf8f5",
2129 "omap_digest": "0xf5fba2c6",
2147 "prior_version": "21'3",
2148 "last_reqid": "osd.1.0:57",
2151 "mtime": "2018-04-05 14:33:19.804040",
2152 "local_mtime": "2018-04-05 14:33:19.804839",
2162 "data_digest": "0x2ddbf8f5",
2163 "omap_digest": "0xf5fba2c6",
2164 "expected_object_size": 0,
2165 "expected_write_size": 0,
2166 "alloc_hint_flags": 0,
2172 "data_digest": "0x2d4a11c2",
2173 "omap_digest": "0xf5fba2c6",
2176 "data_digest_mismatch_info",
2177 "size_mismatch_info",
2178 "obj_size_info_mismatch"
2184 "selected_object_info": {
2195 "prior_version": "21'3",
2196 "last_reqid": "osd.1.0:57",
2199 "mtime": "2018-04-05 14:33:19.804040",
2200 "local_mtime": "2018-04-05 14:33:19.804839",
2210 "data_digest": "0x2ddbf8f5",
2211 "omap_digest": "0xf5fba2c6",
2212 "expected_object_size": 0,
2213 "expected_write_size": 0,
2214 "alloc_hint_flags": 0,
2220 "union_shard_errors": [
2221 "data_digest_mismatch_info",
2222 "size_mismatch_info",
2223 "obj_size_info_mismatch"
2226 "data_digest_mismatch",
2240 "data_digest": "0x2ddbf8f5",
2241 "omap_digest": "0xa8dd5adc",
2244 "omap_digest_mismatch_info"
2250 "data_digest": "0x2ddbf8f5",
2251 "omap_digest": "0xa8dd5adc",
2254 "omap_digest_mismatch_info"
2260 "selected_object_info": {
2261 "alloc_hint_flags": 0,
2262 "data_digest": "0x2ddbf8f5",
2263 "expected_object_size": 0,
2264 "expected_write_size": 0,
2284 "omap_digest": "0xc2025a24",
2291 "union_shard_errors": [
2292 "omap_digest_mismatch_info"
2306 "data_digest": "0x2ddbf8f5",
2307 "omap_digest": "0xa03cef03",
2322 "selected_object_info": {
2333 "prior_version": "41'33",
2334 "last_reqid": "osd.1.0:51",
2337 "mtime": "2018-04-05 14:33:26.761286",
2338 "local_mtime": "2018-04-05 14:33:26.762368",
2348 "data_digest": "0x2ddbf8f5",
2349 "omap_digest": "0xa03cef03",
2350 "expected_object_size": 0,
2351 "expected_write_size": 0,
2352 "alloc_hint_flags": 0,
2358 "union_shard_errors": [
2380 "data_digest": "0x2ddbf8f5",
2381 "omap_digest": "0x067f306a",
2388 "selected_object_info": {
2399 "prior_version": "43'36",
2400 "last_reqid": "osd.1.0:55",
2403 "mtime": "2018-04-05 14:33:27.460958",
2404 "local_mtime": "2018-04-05 14:33:27.462109",
2414 "data_digest": "0x2ddbf8f5",
2415 "omap_digest": "0x067f306a",
2416 "expected_object_size": 0,
2417 "expected_write_size": 0,
2418 "alloc_hint_flags": 0,
2424 "union_shard_errors": [
2454 "union_shard_errors": [
2470 "object_info": "bad-val",
2471 "data_digest": "0x2ddbf8f5",
2472 "omap_digest": "0x4f14f849",
2481 "data_digest": "0x2ddbf8f5",
2482 "omap_digest": "0x4f14f849",
2491 "union_shard_errors": [
2518 "prior_version": "49'45",
2519 "last_reqid": "osd.1.0:48",
2522 "mtime": "2018-04-05 14:33:29.498969",
2523 "local_mtime": "2018-04-05 14:33:29.499890",
2533 "data_digest": "0x2ddbf8f5",
2534 "omap_digest": "0x2d2a4d6e",
2535 "expected_object_size": 0,
2536 "expected_write_size": 0,
2537 "alloc_hint_flags": 0,
2543 "data_digest": "0x2ddbf8f5",
2544 "omap_digest": "0x2d2a4d6e",
2551 "data_digest": "0x2ddbf8f5",
2552 "omap_digest": "0x2d2a4d6e",
2561 "selected_object_info": {
2572 "prior_version": "49'45",
2573 "last_reqid": "osd.1.0:48",
2576 "mtime": "2018-04-05 14:33:29.498969",
2577 "local_mtime": "2018-04-05 14:33:29.499890",
2587 "data_digest": "0x2ddbf8f5",
2588 "omap_digest": "0x2d2a4d6e",
2589 "expected_object_size": 0,
2590 "expected_write_size": 0,
2591 "alloc_hint_flags": 0,
2597 "union_shard_errors": [
2620 "data_digest": "0x2ddbf8f5",
2624 "omap_digest": "0x8b699207",
2630 "snapset": "bad-val",
2631 "data_digest": "0x2ddbf8f5",
2635 "omap_digest": "0x8b699207",
2641 "union_shard_errors": [
2654 "selected_object_info": {
2655 "alloc_hint_flags": 0,
2656 "data_digest": "0x2ddbf8f5",
2657 "expected_object_size": 0,
2658 "expected_write_size": 0,
2678 "omap_digest": "0xe9572720",
2687 "data_digest": "0x5af0c3ef",
2689 "data_digest_mismatch_info"
2691 "omap_digest": "0xe9572720",
2697 "data_digest": "0x5af0c3ef",
2699 "data_digest_mismatch_info"
2701 "omap_digest": "0xe9572720",
2707 "union_shard_errors": [
2708 "data_digest_mismatch_info"
2713 "object_info_inconsistency"
2721 "selected_object_info": {
2722 "alloc_hint_flags": 255,
2723 "data_digest": "0x2ddbf8f5",
2724 "expected_object_size": 0,
2725 "expected_write_size": 0,
2745 "omap_digest": "0xddc3680f",
2754 "data_digest": "0xbd89c912",
2756 "data_digest_mismatch_info"
2759 "alloc_hint_flags": 0,
2760 "data_digest": "0x2ddbf8f5",
2761 "expected_object_size": 0,
2762 "expected_write_size": 0,
2782 "omap_digest": "0xddc3680f",
2789 "omap_digest": "0xddc3680f",
2795 "data_digest": "0xbd89c912",
2797 "data_digest_mismatch_info"
2800 "alloc_hint_flags": 255,
2801 "data_digest": "0x2ddbf8f5",
2802 "expected_object_size": 0,
2803 "expected_write_size": 0,
2823 "omap_digest": "0xddc3680f",
2830 "omap_digest": "0xddc3680f",
2836 "union_shard_errors": [
2837 "data_digest_mismatch_info"
2843 "data_digest": "0x578a4830",
2844 "omap_digest": "0xf8e11918",
2847 "data_digest_mismatch_info"
2853 "data_digest": "0x2ddbf8f5",
2854 "omap_digest": "0xf8e11918",
2861 "selected_object_info": {
2872 "prior_version": "23'6",
2873 "last_reqid": "osd.1.0:59",
2876 "mtime": "2018-04-05 14:33:20.498756",
2877 "local_mtime": "2018-04-05 14:33:20.499704",
2887 "data_digest": "0x2ddbf8f5",
2888 "omap_digest": "0xf8e11918",
2889 "expected_object_size": 0,
2890 "expected_write_size": 0,
2891 "alloc_hint_flags": 0,
2897 "union_shard_errors": [
2898 "data_digest_mismatch_info"
2901 "data_digest_mismatch"
2914 "data_digest": "0x2ddbf8f5",
2915 "omap_digest": "0x00b35dfd",
2929 "selected_object_info": {
2940 "prior_version": "25'9",
2941 "last_reqid": "osd.1.0:60",
2944 "mtime": "2018-04-05 14:33:21.189382",
2945 "local_mtime": "2018-04-05 14:33:21.190446",
2955 "data_digest": "0x2ddbf8f5",
2956 "omap_digest": "0x00b35dfd",
2957 "expected_object_size": 0,
2958 "expected_write_size": 0,
2959 "alloc_hint_flags": 0,
2965 "union_shard_errors": [
2980 "data_digest": "0x2ddbf8f5",
2981 "omap_digest": "0xd7178dfe",
2984 "omap_digest_mismatch_info"
2990 "data_digest": "0x2ddbf8f5",
2991 "omap_digest": "0xe2d46ea4",
2998 "selected_object_info": {
3009 "prior_version": "27'12",
3010 "last_reqid": "osd.1.0:61",
3013 "mtime": "2018-04-05 14:33:21.862313",
3014 "local_mtime": "2018-04-05 14:33:21.863261",
3024 "data_digest": "0x2ddbf8f5",
3025 "omap_digest": "0xe2d46ea4",
3026 "expected_object_size": 0,
3027 "expected_write_size": 0,
3028 "alloc_hint_flags": 0,
3034 "union_shard_errors": [
3035 "omap_digest_mismatch_info"
3038 "omap_digest_mismatch"
3051 "data_digest": "0x2ddbf8f5",
3052 "omap_digest": "0x1a862a41",
3059 "data_digest": "0x2ddbf8f5",
3060 "omap_digest": "0x06cac8f6",
3063 "omap_digest_mismatch_info"
3069 "selected_object_info": {
3080 "prior_version": "29'15",
3081 "last_reqid": "osd.1.0:62",
3084 "mtime": "2018-04-05 14:33:22.589300",
3085 "local_mtime": "2018-04-05 14:33:22.590376",
3095 "data_digest": "0x2ddbf8f5",
3096 "omap_digest": "0x1a862a41",
3097 "expected_object_size": 0,
3098 "expected_write_size": 0,
3099 "alloc_hint_flags": 0,
3105 "union_shard_errors": [
3106 "omap_digest_mismatch_info"
3109 "omap_digest_mismatch"
3122 "data_digest": "0x2ddbf8f5",
3123 "omap_digest": "0x689ee887",
3126 "omap_digest_mismatch_info"
3132 "data_digest": "0x2ddbf8f5",
3133 "omap_digest": "0x179c919f",
3140 "selected_object_info": {
3151 "prior_version": "31'18",
3152 "last_reqid": "osd.1.0:53",
3155 "mtime": "2018-04-05 14:33:23.289188",
3156 "local_mtime": "2018-04-05 14:33:23.290130",
3166 "data_digest": "0x2ddbf8f5",
3167 "omap_digest": "0x179c919f",
3168 "expected_object_size": 0,
3169 "expected_write_size": 0,
3170 "alloc_hint_flags": 0,
3176 "union_shard_errors": [
3177 "omap_digest_mismatch_info"
3180 "omap_digest_mismatch"
3193 "data_digest": "0x2ddbf8f5",
3194 "omap_digest": "0xefced57a",
3201 "data_digest": "0x2ddbf8f5",
3202 "omap_digest": "0x6a73cc07",
3205 "omap_digest_mismatch_info"
3211 "selected_object_info": {
3222 "prior_version": "33'21",
3223 "last_reqid": "osd.1.0:52",
3226 "mtime": "2018-04-05 14:33:23.979658",
3227 "local_mtime": "2018-04-05 14:33:23.980731",
3237 "data_digest": "0x2ddbf8f5",
3238 "omap_digest": "0xefced57a",
3239 "expected_object_size": 0,
3240 "expected_write_size": 0,
3241 "alloc_hint_flags": 0,
3247 "union_shard_errors": [
3248 "omap_digest_mismatch_info"
3251 "omap_digest_mismatch"
3268 "name": "key1-ROBJ8"
3272 "value": "val2-ROBJ8",
3273 "name": "key2-ROBJ8"
3276 "data_digest": "0x2ddbf8f5",
3277 "omap_digest": "0xd6be81dc",
3287 "value": "val1-ROBJ8",
3288 "name": "key1-ROBJ8"
3292 "value": "val3-ROBJ8",
3293 "name": "key3-ROBJ8"
3296 "data_digest": "0x2ddbf8f5",
3297 "omap_digest": "0xd6be81dc",
3304 "selected_object_info": {
3315 "prior_version": "79'65",
3316 "last_reqid": "client.4554.0:1",
3319 "mtime": "2018-04-05 14:34:05.598688",
3320 "local_mtime": "2018-04-05 14:34:05.599698",
3330 "data_digest": "0x2ddbf8f5",
3331 "omap_digest": "0xd6be81dc",
3332 "expected_object_size": 0,
3333 "expected_write_size": 0,
3334 "alloc_hint_flags": 0,
3340 "union_shard_errors": [],
3342 "attr_value_mismatch",
3343 "attr_name_mismatch"
3367 "prior_version": "37'27",
3368 "last_reqid": "osd.1.0:63",
3371 "mtime": "2018-04-05 14:33:25.352485",
3372 "local_mtime": "2018-04-05 14:33:25.353746",
3382 "data_digest": "0x2ddbf8f5",
3383 "omap_digest": "0x2eecc539",
3384 "expected_object_size": 0,
3385 "expected_write_size": 0,
3386 "alloc_hint_flags": 0,
3392 "data_digest": "0x1f26fb26",
3393 "omap_digest": "0x2eecc539",
3396 "obj_size_info_mismatch"
3412 "version": "119'68",
3413 "prior_version": "51'64",
3414 "last_reqid": "client.4834.0:1",
3417 "mtime": "2018-04-05 14:35:01.500659",
3418 "local_mtime": "2018-04-05 14:35:01.502117",
3428 "data_digest": "0x1f26fb26",
3429 "omap_digest": "0x2eecc539",
3430 "expected_object_size": 0,
3431 "expected_write_size": 0,
3432 "alloc_hint_flags": 0,
3438 "data_digest": "0x1f26fb26",
3439 "omap_digest": "0x2eecc539",
3446 "selected_object_info": {
3456 "version": "119'68",
3457 "prior_version": "51'64",
3458 "last_reqid": "client.4834.0:1",
3461 "mtime": "2018-04-05 14:35:01.500659",
3462 "local_mtime": "2018-04-05 14:35:01.502117",
3472 "data_digest": "0x1f26fb26",
3473 "omap_digest": "0x2eecc539",
3474 "expected_object_size": 0,
3475 "expected_write_size": 0,
3476 "alloc_hint_flags": 0,
3482 "union_shard_errors": [
3483 "obj_size_info_mismatch"
3486 "object_info_inconsistency"
3501 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python3
-c "$sortkeys" > $dir/csjson
3502 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
3503 if test $getjson = "yes"
3505 jq
'.' $dir/json
> save2.json
3508 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
3510 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
3516 # This hangs if the repair doesn't work
3517 timeout
30 rados
-p $poolname get ROBJ17
$dir/robj17.out ||
return 1
3518 timeout
30 rados
-p $poolname get ROBJ18
$dir/robj18.out ||
return 1
3519 # Even though we couldn't repair all of the introduced errors, we can fix ROBJ17
3520 diff -q $dir/new.ROBJ17
$dir/robj17.out ||
return 1
3521 rm -f $dir/new.ROBJ17
$dir/robj17.out ||
return 1
3522 diff -q $dir/new.ROBJ18
$dir/robj18.out ||
return 1
3523 rm -f $dir/new.ROBJ18
$dir/robj18.out ||
return 1
3525 if [ $ERRORS != "0" ];
3527 echo "TEST FAILED WITH $ERRORS ERRORS"
3531 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
3532 teardown
$dir ||
return 1
3537 # Test scrub errors for an erasure coded pool
3539 function corrupt_scrub_erasure
() {
3541 local allow_overwrites
=$2
3542 local poolname
=ecpool
3545 setup
$dir ||
return 1
3546 run_mon
$dir a ||
return 1
3547 run_mgr
$dir x ||
return 1
3548 for id
in $
(seq 0 2) ; do
3549 if [ "$allow_overwrites" = "true" ]; then
3550 run_osd
$dir $id ||
return 1
3552 run_osd_filestore
$dir $id ||
return 1
3555 create_rbd_pool ||
return 1
3558 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 stripe_unit
=2K
--force ||
return 1
3559 wait_for_clean ||
return 1
3561 for i
in $
(seq 1 $total_objs) ; do
3563 add_something
$dir $poolname $objname ||
return 1
3565 local osd
=$
(expr $i % 2)
3569 # Size (deep scrub data_digest too)
3570 local payload
=UVWXYZZZ
3571 echo $payload > $dir/CORRUPT
3572 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3577 dd if=/dev
/urandom of
=$dir/CORRUPT bs
=2048 count
=1
3578 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3583 objectstore_tool
$dir $osd $objname remove ||
return 1
3587 rados
--pool $poolname setxattr
$objname key1-
$objname val1-
$objname ||
return 1
3588 rados
--pool $poolname setxattr
$objname key2-
$objname val2-
$objname ||
return 1
3591 echo -n bad-val
> $dir/bad-val
3592 objectstore_tool
$dir $osd $objname set-attr _key1-
$objname $dir/bad-val ||
return 1
3593 objectstore_tool
$dir $osd $objname rm-attr _key2-
$objname ||
return 1
3594 echo -n val3-
$objname > $dir/newval
3595 objectstore_tool
$dir $osd $objname set-attr _key3-
$objname $dir/newval ||
return 1
3596 rm $dir/bad-val
$dir/newval
3601 dd if=/dev
/urandom of
=$dir/CORRUPT bs
=2048 count
=2
3602 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3606 objectstore_tool
$dir 0 $objname rm-attr hinfo_key ||
return 1
3607 echo -n bad-val
> $dir/bad-val
3608 objectstore_tool
$dir 1 $objname set-attr hinfo_key
$dir/bad-val ||
return 1
3612 local payload
=MAKETHISDIFFERENTFROMOTHEROBJECTS
3613 echo $payload > $dir/DIFFERENT
3614 rados
--pool $poolname put
$objname $dir/DIFFERENT ||
return 1
3616 # Get hinfo_key from EOBJ1
3617 objectstore_tool
$dir 0 EOBJ1 get-attr hinfo_key
> $dir/hinfo
3618 objectstore_tool
$dir 0 $objname set-attr hinfo_key
$dir/hinfo ||
return 1
3625 local pg
=$
(get_pg
$poolname EOBJ0
)
3629 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
3631 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
3633 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
3635 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
3636 # Get epoch for repair-get requests
3637 epoch
=$
(jq .epoch
$dir/json
)
3639 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
3663 "prior_version": "0'0",
3664 "last_reqid": "client.4184.0:1",
3676 "data_digest": "0x2ddbf8f5",
3677 "omap_digest": "0xffffffff",
3678 "expected_object_size": 0,
3679 "expected_write_size": 0,
3680 "alloc_hint_flags": 0,
3689 "size_mismatch_info",
3690 "obj_size_info_mismatch"
3703 "selected_object_info": {
3714 "prior_version": "0'0",
3715 "last_reqid": "client.4184.0:1",
3727 "data_digest": "0x2ddbf8f5",
3728 "omap_digest": "0xffffffff",
3729 "expected_object_size": 0,
3730 "expected_write_size": 0,
3731 "alloc_hint_flags": 0,
3737 "union_shard_errors": [
3738 "size_mismatch_info",
3739 "obj_size_info_mismatch"
3777 "selected_object_info": {
3788 "prior_version": "0'0",
3789 "last_reqid": "client.4252.0:1",
3801 "data_digest": "0x2ddbf8f5",
3802 "omap_digest": "0xffffffff",
3803 "expected_object_size": 0,
3804 "expected_write_size": 0,
3805 "alloc_hint_flags": 0,
3811 "union_shard_errors": [
3830 "name": "key1-EOBJ4"
3834 "value": "val2-EOBJ4",
3835 "name": "key2-EOBJ4"
3853 "value": "val1-EOBJ4",
3854 "name": "key1-EOBJ4"
3858 "value": "val2-EOBJ4",
3859 "name": "key2-EOBJ4"
3872 "value": "val1-EOBJ4",
3873 "name": "key1-EOBJ4"
3877 "value": "val3-EOBJ4",
3878 "name": "key3-EOBJ4"
3883 "selected_object_info": {
3894 "prior_version": "45'5",
3895 "last_reqid": "client.4294.0:1",
3907 "data_digest": "0x2ddbf8f5",
3908 "omap_digest": "0xffffffff",
3909 "expected_object_size": 0,
3910 "expected_write_size": 0,
3911 "alloc_hint_flags": 0,
3917 "union_shard_errors": [],
3919 "attr_value_mismatch",
3920 "attr_name_mismatch"
3951 "prior_version": "0'0",
3952 "last_reqid": "client.4382.0:1",
3964 "data_digest": "0x2ddbf8f5",
3965 "omap_digest": "0xffffffff",
3966 "expected_object_size": 0,
3967 "expected_write_size": 0,
3968 "alloc_hint_flags": 0,
3977 "size_mismatch_info",
3978 "obj_size_info_mismatch"
3991 "selected_object_info": {
4002 "prior_version": "0'0",
4003 "last_reqid": "client.4382.0:1",
4015 "data_digest": "0x2ddbf8f5",
4016 "omap_digest": "0xffffffff",
4017 "expected_object_size": 0,
4018 "expected_write_size": 0,
4019 "alloc_hint_flags": 0,
4025 "union_shard_errors": [
4026 "size_mismatch_info",
4027 "obj_size_info_mismatch"
4049 "selected_object_info": {
4060 "prior_version": "0'0",
4061 "last_reqid": "client.4418.0:1",
4073 "data_digest": "0x2ddbf8f5",
4074 "omap_digest": "0xffffffff",
4075 "expected_object_size": 0,
4076 "expected_write_size": 0,
4077 "alloc_hint_flags": 0,
4100 "hashinfo": "bad-val",
4110 "cumulative_shard_hashes": [
4124 "total_chunk_size": 2048
4128 "union_shard_errors": [
4135 "hinfo_inconsistency"
4144 "selected_object_info": {
4155 "prior_version": "75'9",
4156 "last_reqid": "client.4482.0:1",
4168 "data_digest": "0x136e4e27",
4169 "omap_digest": "0xffffffff",
4170 "expected_object_size": 0,
4171 "expected_write_size": 0,
4172 "alloc_hint_flags": 0,
4181 "cumulative_shard_hashes": [
4195 "total_chunk_size": 2048
4205 "cumulative_shard_hashes": [
4219 "total_chunk_size": 2048
4229 "cumulative_shard_hashes": [
4243 "total_chunk_size": 2048
4252 "union_shard_errors": []
4259 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python3
-c "$sortkeys" > $dir/csjson
4260 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
4261 if test $getjson = "yes"
4263 jq
'.' $dir/json
> save3.json
4266 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
4268 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
4273 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
4275 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
4277 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
4279 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
4280 # Get epoch for repair-get requests
4281 epoch
=$
(jq .epoch
$dir/json
)
4283 if [ "$allow_overwrites" = "true" ]
4285 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
4291 "data_digest": "0x00000000",
4292 "omap_digest": "0xffffffff",
4311 "prior_version": "0'0",
4312 "last_reqid": "client.4184.0:1",
4315 "mtime": "2018-04-05 14:31:33.837147",
4316 "local_mtime": "2018-04-05 14:31:33.840763",
4324 "data_digest": "0x2ddbf8f5",
4325 "omap_digest": "0xffffffff",
4326 "expected_object_size": 0,
4327 "expected_write_size": 0,
4328 "alloc_hint_flags": 0,
4338 "size_mismatch_info",
4339 "obj_size_info_mismatch"
4345 "data_digest": "0x00000000",
4346 "omap_digest": "0xffffffff",
4354 "selected_object_info": {
4365 "prior_version": "0'0",
4366 "last_reqid": "client.4184.0:1",
4369 "mtime": "2018-04-05 14:31:33.837147",
4370 "local_mtime": "2018-04-05 14:31:33.840763",
4378 "data_digest": "0x2ddbf8f5",
4379 "omap_digest": "0xffffffff",
4380 "expected_object_size": 0,
4381 "expected_write_size": 0,
4382 "alloc_hint_flags": 0,
4388 "union_shard_errors": [
4390 "size_mismatch_info",
4391 "obj_size_info_mismatch"
4407 "data_digest": "0x00000000",
4408 "omap_digest": "0xffffffff",
4424 "data_digest": "0x00000000",
4425 "omap_digest": "0xffffffff",
4433 "selected_object_info": {
4444 "prior_version": "0'0",
4445 "last_reqid": "client.4252.0:1",
4448 "mtime": "2018-04-05 14:31:46.841145",
4449 "local_mtime": "2018-04-05 14:31:46.844996",
4457 "data_digest": "0x2ddbf8f5",
4458 "omap_digest": "0xffffffff",
4459 "expected_object_size": 0,
4460 "expected_write_size": 0,
4461 "alloc_hint_flags": 0,
4467 "union_shard_errors": [
4486 "name": "key1-EOBJ4"
4490 "value": "val2-EOBJ4",
4491 "name": "key2-EOBJ4"
4494 "data_digest": "0x00000000",
4495 "omap_digest": "0xffffffff",
4506 "value": "val1-EOBJ4",
4507 "name": "key1-EOBJ4"
4511 "value": "val2-EOBJ4",
4512 "name": "key2-EOBJ4"
4515 "data_digest": "0x00000000",
4516 "omap_digest": "0xffffffff",
4527 "value": "val1-EOBJ4",
4528 "name": "key1-EOBJ4"
4532 "value": "val3-EOBJ4",
4533 "name": "key3-EOBJ4"
4536 "data_digest": "0x00000000",
4537 "omap_digest": "0xffffffff",
4545 "selected_object_info": {
4556 "prior_version": "45'5",
4557 "last_reqid": "client.4294.0:1",
4560 "mtime": "2018-04-05 14:31:54.663622",
4561 "local_mtime": "2018-04-05 14:31:54.664527",
4569 "data_digest": "0x2ddbf8f5",
4570 "omap_digest": "0xffffffff",
4571 "expected_object_size": 0,
4572 "expected_write_size": 0,
4573 "alloc_hint_flags": 0,
4579 "union_shard_errors": [],
4581 "attr_value_mismatch",
4582 "attr_name_mismatch"
4595 "data_digest": "0x00000000",
4596 "omap_digest": "0xffffffff",
4604 "data_digest": "0x00000000",
4605 "omap_digest": "0xffffffff",
4617 "prior_version": "0'0",
4618 "last_reqid": "client.4382.0:1",
4621 "mtime": "2018-04-05 14:32:12.929161",
4622 "local_mtime": "2018-04-05 14:32:12.934707",
4630 "data_digest": "0x2ddbf8f5",
4631 "omap_digest": "0xffffffff",
4632 "expected_object_size": 0,
4633 "expected_write_size": 0,
4634 "alloc_hint_flags": 0,
4642 "size_mismatch_info",
4643 "obj_size_info_mismatch"
4650 "data_digest": "0x00000000",
4651 "omap_digest": "0xffffffff",
4659 "selected_object_info": {
4670 "prior_version": "0'0",
4671 "last_reqid": "client.4382.0:1",
4674 "mtime": "2018-04-05 14:32:12.929161",
4675 "local_mtime": "2018-04-05 14:32:12.934707",
4683 "data_digest": "0x2ddbf8f5",
4684 "omap_digest": "0xffffffff",
4685 "expected_object_size": 0,
4686 "expected_write_size": 0,
4687 "alloc_hint_flags": 0,
4693 "union_shard_errors": [
4694 "size_mismatch_info",
4695 "obj_size_info_mismatch"
4717 "union_shard_errors": [
4722 "selected_object_info": {
4733 "prior_version": "0'0",
4734 "last_reqid": "client.4418.0:1",
4737 "mtime": "2018-04-05 14:32:20.634116",
4738 "local_mtime": "2018-04-05 14:32:20.637999",
4746 "data_digest": "0x2ddbf8f5",
4747 "omap_digest": "0xffffffff",
4748 "expected_object_size": 0,
4749 "expected_write_size": 0,
4750 "alloc_hint_flags": 0,
4776 "hashinfo": "bad-val"
4784 "omap_digest": "0xffffffff",
4785 "data_digest": "0x00000000",
4787 "cumulative_shard_hashes": [
4801 "total_chunk_size": 2048
4815 "hinfo_inconsistency"
4817 "union_shard_errors": [],
4818 "selected_object_info": {
4829 "prior_version": "75'9",
4830 "last_reqid": "client.4482.0:1",
4833 "mtime": "2018-04-05 14:32:33.058782",
4834 "local_mtime": "2018-04-05 14:32:33.059679",
4842 "data_digest": "0x136e4e27",
4843 "omap_digest": "0xffffffff",
4844 "expected_object_size": 0,
4845 "expected_write_size": 0,
4846 "alloc_hint_flags": 0,
4859 "omap_digest": "0xffffffff",
4860 "data_digest": "0x00000000",
4862 "cumulative_shard_hashes": [
4876 "total_chunk_size": 2048
4885 "omap_digest": "0xffffffff",
4886 "data_digest": "0x00000000",
4888 "cumulative_shard_hashes": [
4902 "total_chunk_size": 2048
4911 "omap_digest": "0xffffffff",
4912 "data_digest": "0x00000000",
4914 "cumulative_shard_hashes": [
4928 "total_chunk_size": 2048
4940 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
4946 "data_digest": "0x04cfa72f",
4947 "omap_digest": "0xffffffff",
4966 "prior_version": "0'0",
4967 "last_reqid": "client.4192.0:1",
4970 "mtime": "2018-04-05 14:30:10.688009",
4971 "local_mtime": "2018-04-05 14:30:10.691774",
4979 "data_digest": "0x2ddbf8f5",
4980 "omap_digest": "0xffffffff",
4981 "expected_object_size": 0,
4982 "expected_write_size": 0,
4983 "alloc_hint_flags": 0,
4993 "size_mismatch_info",
4994 "obj_size_info_mismatch"
5000 "data_digest": "0x04cfa72f",
5001 "omap_digest": "0xffffffff",
5009 "selected_object_info": {
5020 "prior_version": "0'0",
5021 "last_reqid": "client.4192.0:1",
5024 "mtime": "2018-04-05 14:30:10.688009",
5025 "local_mtime": "2018-04-05 14:30:10.691774",
5033 "data_digest": "0x2ddbf8f5",
5034 "omap_digest": "0xffffffff",
5035 "expected_object_size": 0,
5036 "expected_write_size": 0,
5037 "alloc_hint_flags": 0,
5043 "union_shard_errors": [
5045 "size_mismatch_info",
5046 "obj_size_info_mismatch"
5071 "data_digest": "0x04cfa72f",
5072 "omap_digest": "0xffffffff",
5080 "data_digest": "0x04cfa72f",
5081 "omap_digest": "0xffffffff",
5089 "selected_object_info": {
5100 "prior_version": "0'0",
5101 "last_reqid": "client.4224.0:1",
5104 "mtime": "2018-04-05 14:30:14.152945",
5105 "local_mtime": "2018-04-05 14:30:14.154014",
5113 "data_digest": "0x2ddbf8f5",
5114 "omap_digest": "0xffffffff",
5115 "expected_object_size": 0,
5116 "expected_write_size": 0,
5117 "alloc_hint_flags": 0,
5123 "union_shard_errors": [
5138 "data_digest": "0x04cfa72f",
5139 "omap_digest": "0xffffffff",
5155 "data_digest": "0x04cfa72f",
5156 "omap_digest": "0xffffffff",
5164 "selected_object_info": {
5175 "prior_version": "0'0",
5176 "last_reqid": "client.4258.0:1",
5179 "mtime": "2018-04-05 14:30:18.875544",
5180 "local_mtime": "2018-04-05 14:30:18.880153",
5188 "data_digest": "0x2ddbf8f5",
5189 "omap_digest": "0xffffffff",
5190 "expected_object_size": 0,
5191 "expected_write_size": 0,
5192 "alloc_hint_flags": 0,
5198 "union_shard_errors": [
5217 "name": "key1-EOBJ4"
5221 "value": "val2-EOBJ4",
5222 "name": "key2-EOBJ4"
5225 "data_digest": "0x04cfa72f",
5226 "omap_digest": "0xffffffff",
5239 "omap_digest": "0xffffffff",
5240 "data_digest": "0x04cfa72f",
5244 "value": "val1-EOBJ4",
5245 "name": "key1-EOBJ4"
5249 "value": "val2-EOBJ4",
5250 "name": "key2-EOBJ4"
5260 "omap_digest": "0xffffffff",
5261 "data_digest": "0x04cfa72f",
5265 "value": "val1-EOBJ4",
5266 "name": "key1-EOBJ4"
5270 "value": "val3-EOBJ4",
5271 "name": "key3-EOBJ4"
5276 "selected_object_info": {
5287 "prior_version": "45'5",
5288 "last_reqid": "client.4296.0:1",
5291 "mtime": "2018-04-05 14:30:22.271983",
5292 "local_mtime": "2018-04-05 14:30:22.272840",
5300 "data_digest": "0x2ddbf8f5",
5301 "omap_digest": "0xffffffff",
5302 "expected_object_size": 0,
5303 "expected_write_size": 0,
5304 "alloc_hint_flags": 0,
5310 "union_shard_errors": [],
5312 "attr_value_mismatch",
5313 "attr_name_mismatch"
5326 "data_digest": "0x04cfa72f",
5327 "omap_digest": "0xffffffff",
5346 "prior_version": "0'0",
5347 "last_reqid": "client.4384.0:1",
5350 "mtime": "2018-04-05 14:30:35.162395",
5351 "local_mtime": "2018-04-05 14:30:35.166390",
5359 "data_digest": "0x2ddbf8f5",
5360 "omap_digest": "0xffffffff",
5361 "expected_object_size": 0,
5362 "expected_write_size": 0,
5363 "alloc_hint_flags": 0,
5372 "size_mismatch_info",
5374 "obj_size_info_mismatch"
5380 "data_digest": "0x04cfa72f",
5381 "omap_digest": "0xffffffff",
5389 "selected_object_info": {
5400 "prior_version": "0'0",
5401 "last_reqid": "client.4384.0:1",
5404 "mtime": "2018-04-05 14:30:35.162395",
5405 "local_mtime": "2018-04-05 14:30:35.166390",
5413 "data_digest": "0x2ddbf8f5",
5414 "omap_digest": "0xffffffff",
5415 "expected_object_size": 0,
5416 "expected_write_size": 0,
5417 "alloc_hint_flags": 0,
5423 "union_shard_errors": [
5424 "size_mismatch_info",
5426 "obj_size_info_mismatch"
5448 "union_shard_errors": [
5453 "selected_object_info": {
5464 "prior_version": "0'0",
5465 "last_reqid": "client.4420.0:1",
5468 "mtime": "2018-04-05 14:30:40.914673",
5469 "local_mtime": "2018-04-05 14:30:40.917705",
5477 "data_digest": "0x2ddbf8f5",
5478 "omap_digest": "0xffffffff",
5479 "expected_object_size": 0,
5480 "expected_write_size": 0,
5481 "alloc_hint_flags": 0,
5507 "hashinfo": "bad-val"
5515 "omap_digest": "0xffffffff",
5516 "data_digest": "0x04cfa72f",
5518 "cumulative_shard_hashes": [
5532 "total_chunk_size": 2048
5546 "hinfo_inconsistency"
5548 "union_shard_errors": [
5551 "selected_object_info": {
5562 "prior_version": "75'9",
5563 "last_reqid": "client.4486.0:1",
5566 "mtime": "2018-04-05 14:30:50.995009",
5567 "local_mtime": "2018-04-05 14:30:50.996112",
5575 "data_digest": "0x136e4e27",
5576 "omap_digest": "0xffffffff",
5577 "expected_object_size": 0,
5578 "expected_write_size": 0,
5579 "alloc_hint_flags": 0,
5595 "cumulative_shard_hashes": [
5609 "total_chunk_size": 2048
5618 "omap_digest": "0xffffffff",
5619 "data_digest": "0x5b7455a8",
5621 "cumulative_shard_hashes": [
5635 "total_chunk_size": 2048
5644 "omap_digest": "0xffffffff",
5645 "data_digest": "0x5b7455a8",
5647 "cumulative_shard_hashes": [
5661 "total_chunk_size": 2048
5673 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python3
-c "$sortkeys" > $dir/csjson
5674 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
5675 if test $getjson = "yes"
5677 if [ "$allow_overwrites" = "true" ]
5683 jq
'.' $dir/json
> save
${num}.json
5686 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
5688 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
5691 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
5692 teardown
$dir ||
return 1
# Exercise the EC corruption/scrub test in append-only mode
# (overwrites disabled). $1 is the test directory passed in by main().
function TEST_corrupt_scrub_erasure_appends() {
    corrupt_scrub_erasure "$1" false
}
# Exercise the EC corruption/scrub test with EC overwrites enabled,
# but only on platforms where overwrites are safe to test
# ($use_ec_overwrite is forced to false on FreeBSD — see file header).
# $1 is the test directory passed in by main().
function TEST_corrupt_scrub_erasure_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_scrub_erasure "$1" true
    fi
}
5706 # Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
5708 function TEST_periodic_scrub_replicated
() {
5710 local poolname
=psr_pool
5713 setup
$dir ||
return 1
5714 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
5715 run_mgr
$dir x ||
return 1
5716 local ceph_osd_args
="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
5717 ceph_osd_args
+="--osd_scrub_backoff_ratio=0"
5718 run_osd
$dir 0 $ceph_osd_args ||
return 1
5719 run_osd
$dir 1 $ceph_osd_args ||
return 1
5720 create_rbd_pool ||
return 1
5721 wait_for_clean ||
return 1
5723 create_pool
$poolname 1 1 ||
return 1
5724 wait_for_clean ||
return 1
5727 add_something
$dir $poolname $objname scrub ||
return 1
5728 local primary
=$
(get_primary
$poolname $objname)
5729 local pg
=$
(get_pg
$poolname $objname)
5731 # Add deep-scrub only error
5732 local payload
=UVWXYZ
5733 echo $payload > $dir/CORRUPT
5734 # Uses $ceph_osd_args for osd restart
5735 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
5737 # No scrub information available, so expect failure
5739 ! rados list-inconsistent-obj
$pg | jq
'.' ||
return 1
5742 pg_deep_scrub
$pg ||
return 1
5744 # Make sure bad object found
5745 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5748 local last_scrub
=$
(get_last_scrub_stamp
$pg)
5749 # Fake a schedule scrub
5750 ceph tell
$pg scrub ||
return 1
5751 # Wait for schedule regular scrub
5752 wait_for_scrub
$pg "$last_scrub"
5754 # It needed to be upgraded
5755 grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.
${primary}.log ||
return 1
5757 # Bad object still known
5758 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5760 # Can't upgrade with this set
5761 ceph osd
set nodeep-scrub
5762 # Let map change propagate to OSDs
5763 ceph tell osd
.0 get_latest_osdmap
5767 # Fake a schedule scrub
5768 ceph tell
$pg scrub ||
return 1
5769 # Wait for schedule regular scrub
5770 # to notice scrub and skip it
5772 for i
in $
(seq 14 -1 0)
5775 ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.
${primary}.log ||
{ found
=true
; break; }
5776 echo Time left
: $i seconds
5778 test $found = "true" ||
return 1
5780 # Bad object still known
5781 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5784 # Request a regular scrub and it will be done
5786 grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.
${primary}.log ||
return 1
5788 # deep-scrub error is no longer present
5789 rados list-inconsistent-obj
$pg | jq
'.' |
grep -qv $objname ||
return 1
5792 function TEST_scrub_warning
() {
5794 local poolname
=psr_pool
5799 local i7_days
=$
(calc
$i1_day \
* 7)
5800 local i14_days
=$
(calc
$i1_day \
* 14)
5802 local conf_overdue_seconds
=$
(calc
$i7_days + $i1_day + \
( $i7_days \
* $overdue \
) )
5803 local pool_overdue_seconds
=$
(calc
$i14_days + $i1_day + \
( $i14_days \
* $overdue \
) )
5805 setup
$dir ||
return 1
5806 run_mon
$dir a
--osd_pool_default_size=1 ||
return 1
5807 run_mgr
$dir x
--mon_warn_pg_not_scrubbed_ratio=${overdue} --mon_warn_pg_not_deep_scrubbed_ratio=${overdue} ||
return 1
5808 run_osd
$dir 0 $ceph_osd_args --osd_scrub_backoff_ratio=0 ||
return 1
5810 for i
in $
(seq 1 $
(expr $scrubs + $deep_scrubs))
5812 create_pool
$poolname-$i 1 1 ||
return 1
5813 wait_for_clean ||
return 1
5816 ceph osd pool
set $poolname-$i scrub_max_interval
$i14_days
5818 if [ $i = $
(expr $scrubs + 1) ];
5820 ceph osd pool
set $poolname-$i deep_scrub_interval
$i14_days
5827 ceph osd
set noscrub ||
return 1
5828 ceph osd
set nodeep-scrub ||
return 1
5829 ceph config
set global osd_scrub_interval_randomize_ratio
0
5830 ceph config
set global osd_deep_scrub_randomize_ratio
0
5831 ceph config
set global osd_scrub_max_interval
${i7_days}
5832 ceph config
set global osd_deep_scrub_interval
${i7_days}
5834 # Fake schedule scrubs
5835 for i
in $
(seq 1 $scrubs)
5839 overdue_seconds
=$pool_overdue_seconds
5841 overdue_seconds
=$conf_overdue_seconds
5843 ceph tell
${i}.0 scrub $(expr ${overdue_seconds} + ${i}00) ||
return 1
5845 # Fake schedule deep scrubs
5846 for i
in $
(seq $
(expr $scrubs + 1) $
(expr $scrubs + $deep_scrubs))
5848 if [ $i = "$(expr $scrubs + 1)" ];
5850 overdue_seconds
=$pool_overdue_seconds
5852 overdue_seconds
=$conf_overdue_seconds
5854 ceph tell
${i}.0 deep_scrub $(expr ${overdue_seconds} + ${i}00) ||
return 1
5860 ceph health |
grep -q "$deep_scrubs pgs not deep-scrubbed in time" ||
return 1
5861 ceph health |
grep -q "$scrubs pgs not scrubbed in time" ||
return 1
5862 COUNT
=$
(ceph health detail |
grep "not scrubbed since" |
wc -l)
5863 if [ "$COUNT" != $scrubs ]; then
5864 ceph health detail |
grep "not scrubbed since"
5867 COUNT
=$
(ceph health detail |
grep "not deep-scrubbed since" |
wc -l)
5868 if [ "$COUNT" != $deep_scrubs ]; then
5869 ceph health detail |
grep "not deep-scrubbed since"
5876 # Corrupt snapset in replicated pool
5878 function TEST_corrupt_snapset_scrub_rep
() {
5880 local poolname
=csr_pool
5883 setup
$dir ||
return 1
5884 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
5885 run_mgr
$dir x ||
return 1
5886 run_osd
$dir 0 ||
return 1
5887 run_osd
$dir 1 ||
return 1
5888 create_rbd_pool ||
return 1
5889 wait_for_clean ||
return 1
5891 create_pool foo
1 ||
return 1
5892 create_pool
$poolname 1 1 ||
return 1
5893 wait_for_clean ||
return 1
5895 for i
in $
(seq 1 $total_objs) ; do
5897 add_something
$dir $poolname $objname ||
return 1
5899 rados
--pool $poolname setomapheader
$objname hdr-
$objname ||
return 1
5900 rados
--pool $poolname setomapval
$objname key-
$objname val-
$objname ||
return 1
5903 local pg
=$
(get_pg
$poolname ROBJ0
)
5904 local primary
=$
(get_primary
$poolname ROBJ0
)
5906 rados
-p $poolname mksnap snap1
5907 echo -n head_of_snapshot_data
> $dir/change
5909 for i
in $
(seq 1 $total_objs) ; do
5912 # Alternate corruption between osd.0 and osd.1
5913 local osd
=$
(expr $i % 2)
5917 rados
--pool $poolname put
$objname $dir/change
5918 objectstore_tool
$dir $osd --head $objname clear-snapset corrupt ||
return 1
5922 rados
--pool $poolname put
$objname $dir/change
5923 objectstore_tool
$dir $osd --head $objname clear-snapset corrupt ||
return 1
5932 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
5934 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
5936 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
5938 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
5940 jq
"$jqfilter" << EOF | jq '.inconsistents' | python3 -c "$sortkeys" > $dir/checkcsjson
5953 "snapset_inconsistency"
5955 "union_shard_errors": [],
5956 "selected_object_info": {
5967 "prior_version": "21'3",
5968 "last_reqid": "client.4195.0:1",
5971 "mtime": "2018-04-05 14:35:43.286117",
5972 "local_mtime": "2018-04-05 14:35:43.288990",
5981 "data_digest": "0x53acb008",
5982 "omap_digest": "0xffffffff",
5983 "expected_object_size": 0,
5984 "expected_write_size": 0,
5985 "alloc_hint_flags": 0,
6032 "snapset_inconsistency"
6034 "union_shard_errors": [],
6035 "selected_object_info": {
6046 "prior_version": "23'6",
6047 "last_reqid": "client.4223.0:1",
6050 "mtime": "2018-04-05 14:35:48.326856",
6051 "local_mtime": "2018-04-05 14:35:48.328097",
6060 "data_digest": "0x53acb008",
6061 "omap_digest": "0xffffffff",
6062 "expected_object_size": 0,
6063 "expected_write_size": 0,
6064 "alloc_hint_flags": 0,
6106 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python3
-c "$sortkeys" > $dir/csjson
6107 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
6108 if test $getjson = "yes"
6110 jq
'.' $dir/json
> save6.json
6113 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
6115 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
6119 declare -a err_strings
6120 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid [0-9]*:.*:::ROBJ1:head : snapset inconsistent"
6121 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid [0-9]*:.*:::ROBJ2:head : snapset inconsistent"
6122 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*:.*:::ROBJ1:1 : is an unexpected clone"
6123 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub : stat mismatch, got 3/4 objects, 1/2 clones, 3/4 dirty, 3/4 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 49/56 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
6124 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 0 missing, 2 inconsistent objects"
6125 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 4 errors"
6127 for err_string
in "${err_strings[@]}"
6129 if ! grep -q "$err_string" $dir/osd.
${primary}.log
6131 echo "Missing log message '$err_string'"
6132 ERRORS
=$
(expr $ERRORS + 1)
6136 if [ $ERRORS != "0" ];
6138 echo "TEST FAILED WITH $ERRORS ERRORS"
6142 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
6143 teardown
$dir ||
return 1
6146 function TEST_request_scrub_priority
() {
6148 local poolname
=psr_pool
6153 setup
$dir ||
return 1
6154 run_mon
$dir a
--osd_pool_default_size=1 ||
return 1
6155 run_mgr
$dir x ||
return 1
6156 local ceph_osd_args
="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
6157 ceph_osd_args
+="--osd_scrub_backoff_ratio=0"
6158 run_osd
$dir 0 $ceph_osd_args ||
return 1
6160 create_pool
$poolname $PGS $PGS ||
return 1
6161 wait_for_clean ||
return 1
6164 add_something
$dir $poolname $objname noscrub ||
return 1
6165 local primary
=$
(get_primary
$poolname $objname)
6166 local pg
=$
(get_pg
$poolname $objname)
6167 poolid
=$
(ceph osd dump |
grep "^pool.*[']${poolname}[']" |
awk '{ print $2 }')
6170 for i
in $
(seq 0 $
(expr $PGS - 1))
6172 opg
="${poolid}.${i}"
6173 if [ "$opg" = "$pg" ]; then
6176 otherpgs
="${otherpgs}${opg} "
6177 local other_last_scrub
=$
(get_last_scrub_stamp
$pg)
6178 # Fake a schedule scrub
6179 ceph tell
$opg scrub
$opg ||
return 1
6185 # Request a regular scrub and it will be done
6186 local last_scrub
=$
(get_last_scrub_stamp
$pg)
6189 ceph osd
unset noscrub ||
return 1
6190 ceph osd
unset nodeep-scrub ||
return 1
6192 wait_for_scrub
$pg "$last_scrub"
6194 for opg
in $otherpgs $pg
6196 wait_for_scrub
$opg "$other_last_scrub"
6199 # Verify that the requested scrub ran first
6200 grep "log_channel.*scrub ok" $dir/osd.
${primary}.log |
grep -v purged_snaps |
head -1 |
sed 's/.*[[]DBG[]]//' |
grep -q $pg ||
return 1
# Entry point: main() is provided by ceph-helpers.sh; it creates the
# test directory "osd-scrub-repair" and runs every TEST_* function,
# forwarding the command-line arguments.
main osd-scrub-repair "$@"
6209 # compile-command: "cd build ; make -j4 && \
6210 # ../qa/run-standalone.sh osd-scrub-repair.sh"