3 # Copyright (C) 2014 Red Hat <contact@redhat.com>
5 # Author: Loic Dachary <loic@dachary.org>
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
# Pull in the standalone test helpers (run_mon, run_osd, objectstore_tool, ...).
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Decide whether erasure-coded overwrite tests may run on this platform.
if [ `uname` = FreeBSD ]; then
    # erasure coding overwrites are only tested on Bluestore
    # erasure coding on filestore is unsafe
    # http://docs.ceph.com/docs/master/rados/operations/erasure-code/#erasure-coding-with-overwrites
    use_ec_overwrite=false
else
    use_ec_overwrite=true
fi
# Test development and debugging
# Set to "yes" in order to ignore diff errors and save results to update test
getjson="no"

# Filter out mtime and local_mtime dates, version, prior_version and last_reqid (client) from any object_info.
jqfilter='def walk(f): . as $in
           | if type == "object" then
                   reduce keys[] as $key
                       ( {}; . + { ($key): ($in[$key] | walk(f)) } ) | f
             elif type == "array" then map( walk(f) ) | f
             else f
             end;
           walk(if type == "object" then del(.mtime) else . end)
           | walk(if type == "object" then del(.local_mtime) else . end)
           | walk(if type == "object" then del(.last_reqid) else . end)
           | walk(if type == "object" then del(.version) else . end)
           | walk(if type == "object" then del(.prior_version) else . end)'
# Re-serialize JSON read from stdin with keys sorted, for stable diffs.
# NOTE: print(...) (function-call form) works under both Python 2 and 3;
# the bare Python-2 print statement would be a SyntaxError under Python 3.
sortkeys='import json; import sys ; JSON=sys.stdin.read() ; ud = json.loads(JSON) ; print(json.dumps(ud, sort_keys=True, indent=2))'
54 export CEPH_MON
="127.0.0.1:7107" # git grep '\<7107\>' : there must be only one
56 CEPH_ARGS
+="--fsid=$(uuidgen) --auth-supported=none "
57 CEPH_ARGS
+="--mon-host=$CEPH_MON "
58 CEPH_ARGS
+="--osd-skip-data-digest=false "
60 export -n CEPH_CLI_TEST_DUP_COMMAND
61 local funcs
=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
62 for func
in $funcs ; do
63 $func $dir ||
return 1
# Write a small object into the given pool so later tests can corrupt/repair it.
# $1 - test dir, $2 - pool name, $3 - object name (default SOMETHING),
# $4 - "noscrub" (default) to disable scrubbing first, anything else re-enables it.
function add_something() {
    local dir=$1
    local poolname=$2
    local obj=${3:-SOMETHING}
    local scrub=${4:-noscrub}

    if [ "$scrub" = "noscrub" ];
    then
        # keep the scrubber from racing with the test's own corruption/repair
        ceph osd set noscrub || return 1
        ceph osd set nodeep-scrub || return 1
    else
        ceph osd unset noscrub || return 1
        ceph osd unset nodeep-scrub || return 1
    fi

    local payload=ABCDEF
    echo $payload > $dir/ORIGINAL
    rados --pool $poolname put $obj $dir/ORIGINAL || return 1
}
#
# Corrupt one copy of a replicated pool
#
function TEST_corrupt_and_repair_replicated() {
    local dir=$1
    local poolname=rbd

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=2 || return 1
    run_mgr $dir x || return 1
    run_osd $dir 0 || return 1
    run_osd $dir 1 || return 1
    create_rbd_pool || return 1
    wait_for_clean || return 1

    add_something $dir $poolname || return 1
    # corrupt a replica first, then the primary
    corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1
    # Reproduces http://tracker.ceph.com/issues/8914
    corrupt_and_repair_one $dir $poolname $(get_primary $poolname SOMETHING) || return 1

    teardown $dir || return 1
}
#
# Allow repair to be scheduled when some recovering is still undergoing on the same OSD
#
function TEST_allow_repair_during_recovery() {
    local dir=$1
    local poolname=rbd

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=2 || return 1
    run_mgr $dir x || return 1
    # pretend recovery is active so only repair (not plain scrub) may run
    run_osd $dir 0 --osd_scrub_during_recovery=false \
                   --osd_repair_during_recovery=true \
                   --osd_debug_pretend_recovery_active=true || return 1
    run_osd $dir 1 --osd_scrub_during_recovery=false \
                   --osd_repair_during_recovery=true \
                   --osd_debug_pretend_recovery_active=true || return 1
    create_rbd_pool || return 1
    wait_for_clean || return 1

    add_something $dir $poolname || return 1
    corrupt_and_repair_one $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1

    teardown $dir || return 1
}
#
# Skip non-repair scrub correctly during recovery
#
function TEST_skip_non_repair_during_recovery() {
    local dir=$1
    local poolname=rbd

    setup $dir || return 1
    run_mon $dir a --osd_pool_default_size=2 || return 1
    run_mgr $dir x || return 1
    # pretend recovery is active; plain scrubs must not be scheduled
    run_osd $dir 0 --osd_scrub_during_recovery=false \
                   --osd_repair_during_recovery=true \
                   --osd_debug_pretend_recovery_active=true || return 1
    run_osd $dir 1 --osd_scrub_during_recovery=false \
                   --osd_repair_during_recovery=true \
                   --osd_debug_pretend_recovery_active=true || return 1
    create_rbd_pool || return 1
    wait_for_clean || return 1

    add_something $dir $poolname || return 1
    scrub_and_not_schedule $dir $poolname $(get_not_primary $poolname SOMETHING) || return 1

    teardown $dir || return 1
}
160 function scrub_and_not_schedule
() {
166 # 1) start a non-repair scrub
168 local pg
=$
(get_pg
$poolname SOMETHING
)
169 local last_scrub
=$
(get_last_scrub_stamp
$pg)
173 # 2) Assure the scrub is not scheduled
175 for ((i
=0; i
< 3; i
++)); do
176 if test "$(get_last_scrub_stamp $pg)" '>' "$last_scrub" ; then
183 # 3) Access to the file must OK
185 objectstore_tool
$dir $osd SOMETHING list-attrs ||
return 1
186 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
187 diff $dir/ORIGINAL
$dir/COPY ||
return 1
# Remove SOMETHING from two OSDs at once, repair the PG, and verify both
# copies come back and the object content is intact.
# $1 - test dir, $2 - pool name, $3/$4 - the two osd ids to corrupt.
function corrupt_and_repair_two() {
    local dir=$1
    local poolname=$2
    local first=$3
    local second=$4

    #
    # 1) remove the corresponding file from the OSDs
    #
    pids=""
    run_in_background pids objectstore_tool $dir $first SOMETHING remove
    run_in_background pids objectstore_tool $dir $second SOMETHING remove
    wait_background pids
    return_code=$?
    if [ $return_code -ne 0 ]; then return $return_code; fi

    #
    # 2) repair the PG
    #
    local pg=$(get_pg $poolname SOMETHING)
    repair $pg

    #
    # 3) The files must be back
    #
    pids=""
    run_in_background pids objectstore_tool $dir $first SOMETHING list-attrs
    run_in_background pids objectstore_tool $dir $second SOMETHING list-attrs
    wait_background pids
    return_code=$?
    if [ $return_code -ne 0 ]; then return $return_code; fi

    rados --pool $poolname get SOMETHING $dir/COPY || return 1
    diff $dir/ORIGINAL $dir/COPY || return 1
}
#
# 2) remove the corresponding file from a designated OSD
# 4) check that the file has been restored in the designated OSD
#
function corrupt_and_repair_one() {
    local dir=$1
    local poolname=$2
    local osd=$3

    #
    # 1) remove the corresponding file from the OSD
    #
    objectstore_tool $dir $osd SOMETHING remove || return 1
    #
    # 2) repair the PG
    #
    local pg=$(get_pg $poolname SOMETHING)
    repair $pg
    #
    # 3) The file must be back
    #
    objectstore_tool $dir $osd SOMETHING list-attrs || return 1
    rados --pool $poolname get SOMETHING $dir/COPY || return 1
    diff $dir/ORIGINAL $dir/COPY || return 1
}
# Exercise single- and double-shard corruption/repair on an EC pool.
# $1 - test dir, $2 - EC pool name.
function corrupt_and_repair_erasure_coded() {
    local dir=$1
    local poolname=$2

    add_something $dir $poolname || return 1

    local primary=$(get_primary $poolname SOMETHING)
    # acting set without the primary
    local -a osds=($(get_osds $poolname SOMETHING | sed -e "s/$primary//"))
    local not_primary_first=${osds[0]}
    local not_primary_second=${osds[1]}

    # Reproduces http://tracker.ceph.com/issues/10017
    corrupt_and_repair_one $dir $poolname $primary || return 1
    # Reproduces http://tracker.ceph.com/issues/10409
    corrupt_and_repair_one $dir $poolname $not_primary_first || return 1
    corrupt_and_repair_two $dir $poolname $not_primary_first $not_primary_second || return 1
    corrupt_and_repair_two $dir $poolname $primary $not_primary_first || return 1
}
# Verify that a deep scrub with auto-repair enabled restores a physically
# removed EC shard without an explicit repair command.
# $1 - test dir, $2 - "true" to use overwrite-capable (Bluestore) OSDs.
function auto_repair_erasure_coded() {
    local dir=$1
    local allow_overwrites=$2
    local poolname=ecpool

    # Launch a cluster with 5 seconds scrub interval
    setup $dir || return 1
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    local ceph_osd_args="--osd-scrub-auto-repair=true \
            --osd-deep-scrub-interval=5 \
            --osd-scrub-max-interval=5 \
            --osd-scrub-min-interval=5 \
            --osd-scrub-interval-randomize-ratio=0"
    for id in $(seq 0 2) ; do
        if [ "$allow_overwrites" = "true" ]; then
            run_osd $dir $id $ceph_osd_args || return 1
        else
            run_osd_filestore $dir $id $ceph_osd_args || return 1
        fi
    done
    create_rbd_pool || return 1
    wait_for_clean || return 1

    # Create an EC pool
    create_ec_pool $poolname $allow_overwrites k=2 m=1 || return 1

    # Put an object
    local payload=ABCDEF
    echo $payload > $dir/ORIGINAL
    rados --pool $poolname put SOMETHING $dir/ORIGINAL || return 1

    # Remove the object from one shard physically
    # Restarted osd get $ceph_osd_args passed
    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING remove || return 1
    # Wait for auto repair
    local pgid=$(get_pg $poolname SOMETHING)
    wait_for_scrub $pgid "$(get_last_scrub_stamp $pgid)"
    wait_for_clean || return 1
    # Verify - the file should be back
    # Restarted osd get $ceph_osd_args passed
    objectstore_tool $dir $(get_not_primary $poolname SOMETHING) SOMETHING list-attrs || return 1
    rados --pool $poolname get SOMETHING $dir/COPY || return 1
    diff $dir/ORIGINAL $dir/COPY || return 1

    # Tear down
    teardown $dir || return 1
}
# Append-only variant: runs everywhere.
function TEST_auto_repair_erasure_coded_appends() {
    auto_repair_erasure_coded $1 false
}
# Overwrite variant: only runs where EC overwrites are allowed (see top of file).
function TEST_auto_repair_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        auto_repair_erasure_coded $1 true
    fi
}
332 function TEST_auto_repair_bluestore_basic
() {
334 local poolname
=testpool
336 # Launch a cluster with 5 seconds scrub interval
337 setup
$dir ||
return 1
338 run_mon
$dir a ||
return 1
339 run_mgr
$dir x ||
return 1
340 local ceph_osd_args
="--osd-scrub-auto-repair=true \
341 --osd_deep_scrub_randomize_ratio=0 \
342 --osd-scrub-interval-randomize-ratio=0"
343 for id
in $
(seq 0 2) ; do
344 run_osd
$dir $id $ceph_osd_args ||
return 1
347 create_pool
$poolname 1 1 ||
return 1
348 ceph osd pool
set $poolname size
2
349 wait_for_clean ||
return 1
353 echo $payload > $dir/ORIGINAL
354 rados
--pool $poolname put SOMETHING
$dir/ORIGINAL ||
return 1
356 # Remove the object from one shard physically
357 # Restarted osd get $ceph_osd_args passed
358 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING remove ||
return 1
360 local pgid
=$
(get_pg
$poolname SOMETHING
)
361 local primary
=$
(get_primary
$poolname SOMETHING
)
362 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
363 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_deep_scrub
$pgid
364 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_scrub
$pgid
366 # Wait for auto repair
367 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
368 wait_for_clean ||
return 1
370 # Verify - the file should be back
371 # Restarted osd get $ceph_osd_args passed
372 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING list-attrs ||
return 1
373 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING get-bytes
$dir/COPY ||
return 1
374 diff $dir/ORIGINAL
$dir/COPY ||
return 1
375 grep scrub_finish
$dir/osd.
${primary}.log
378 teardown
$dir ||
return 1
381 function TEST_auto_repair_bluestore_scrub
() {
383 local poolname
=testpool
385 # Launch a cluster with 5 seconds scrub interval
386 setup
$dir ||
return 1
387 run_mon
$dir a ||
return 1
388 run_mgr
$dir x ||
return 1
389 local ceph_osd_args
="--osd-scrub-auto-repair=true \
390 --osd_deep_scrub_randomize_ratio=0 \
391 --osd-scrub-interval-randomize-ratio=0"
392 for id
in $
(seq 0 2) ; do
393 run_osd
$dir $id $ceph_osd_args ||
return 1
396 create_pool
$poolname 1 1 ||
return 1
397 ceph osd pool
set $poolname size
2
398 wait_for_clean ||
return 1
402 echo $payload > $dir/ORIGINAL
403 rados
--pool $poolname put SOMETHING
$dir/ORIGINAL ||
return 1
405 # Remove the object from one shard physically
406 # Restarted osd get $ceph_osd_args passed
407 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING remove ||
return 1
409 local pgid
=$
(get_pg
$poolname SOMETHING
)
410 local primary
=$
(get_primary
$poolname SOMETHING
)
411 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
412 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_scrub
$pgid
414 # Wait for scrub -> auto repair
415 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
417 # Actually this causes 2 scrubs, so we better wait a little longer
419 wait_for_clean ||
return 1
421 # Verify - the file should be back
422 # Restarted osd get $ceph_osd_args passed
423 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) SOMETHING list-attrs ||
return 1
424 rados
--pool $poolname get SOMETHING
$dir/COPY ||
return 1
425 diff $dir/ORIGINAL
$dir/COPY ||
return 1
426 grep scrub_finish
$dir/osd.
${primary}.log
428 # This should have caused 1 object to be repaired
429 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
430 test "$COUNT" = "1" ||
return 1
433 teardown
$dir ||
return 1
436 function TEST_auto_repair_bluestore_failed
() {
438 local poolname
=testpool
440 # Launch a cluster with 5 seconds scrub interval
441 setup
$dir ||
return 1
442 run_mon
$dir a ||
return 1
443 run_mgr
$dir x ||
return 1
444 local ceph_osd_args
="--osd-scrub-auto-repair=true \
445 --osd_deep_scrub_randomize_ratio=0 \
446 --osd-scrub-interval-randomize-ratio=0"
447 for id
in $
(seq 0 2) ; do
448 run_osd
$dir $id $ceph_osd_args ||
return 1
451 create_pool
$poolname 1 1 ||
return 1
452 ceph osd pool
set $poolname size
2
453 wait_for_clean ||
return 1
457 echo $payload > $dir/ORIGINAL
460 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
463 # Remove the object from one shard physically
464 # Restarted osd get $ceph_osd_args passed
465 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj1 remove ||
return 1
466 # obj2 can't be repaired
467 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj2 remove ||
return 1
468 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 rm-attr _ ||
return 1
470 local pgid
=$
(get_pg
$poolname obj1
)
471 local primary
=$
(get_primary
$poolname obj1
)
472 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
473 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_deep_scrub
$pgid
474 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_scrub
$pgid
476 # Wait for auto repair
477 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
478 wait_for_clean ||
return 1
480 grep scrub_finish
$dir/osd.
${primary}.log
481 grep -q "scrub_finish.*still present after re-scrub" $dir/osd.
${primary}.log ||
return 1
483 ceph pg dump pgs |
grep -q "^$(pgid).*+failed_repair" ||
return 1
485 # Verify - obj1 should be back
486 # Restarted osd get $ceph_osd_args passed
487 objectstore_tool
$dir $
(get_not_primary
$poolname obj1
) obj1 list-attrs ||
return 1
488 rados
--pool $poolname get obj1
$dir/COPY ||
return 1
489 diff $dir/ORIGINAL
$dir/COPY ||
return 1
490 grep scrub_finish
$dir/osd.
${primary}.log
493 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 remove ||
return 1
498 ceph pg dump pgs |
grep -q "^$(pgid).* active+clean " ||
return 1
499 grep scrub_finish
$dir/osd.
${primary}.log
502 teardown
$dir ||
return 1
505 function TEST_auto_repair_bluestore_failed_norecov
() {
507 local poolname
=testpool
509 # Launch a cluster with 5 seconds scrub interval
510 setup
$dir ||
return 1
511 run_mon
$dir a ||
return 1
512 run_mgr
$dir x ||
return 1
513 local ceph_osd_args
="--osd-scrub-auto-repair=true \
514 --osd_deep_scrub_randomize_ratio=0 \
515 --osd-scrub-interval-randomize-ratio=0"
516 for id
in $
(seq 0 2) ; do
517 run_osd
$dir $id $ceph_osd_args ||
return 1
520 create_pool
$poolname 1 1 ||
return 1
521 ceph osd pool
set $poolname size
2
522 wait_for_clean ||
return 1
526 echo $payload > $dir/ORIGINAL
529 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
532 # Remove the object from one shard physically
533 # Restarted osd get $ceph_osd_args passed
534 # obj1 can't be repaired
535 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj1 remove ||
return 1
536 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj1 rm-attr _ ||
return 1
537 # obj2 can't be repaired
538 objectstore_tool
$dir $
(get_not_primary
$poolname SOMETHING
) obj2 remove ||
return 1
539 objectstore_tool
$dir $
(get_primary
$poolname SOMETHING
) obj2 rm-attr _ ||
return 1
541 local pgid
=$
(get_pg
$poolname obj1
)
542 local primary
=$
(get_primary
$poolname obj1
)
543 local last_scrub_stamp
="$(get_last_scrub_stamp $pgid)"
544 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_deep_scrub
$pgid
545 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
$primary) trigger_scrub
$pgid
547 # Wait for auto repair
548 wait_for_scrub
$pgid "$last_scrub_stamp" ||
return 1
549 wait_for_clean ||
return 1
551 grep -q "scrub_finish.*present with no repair possible" $dir/osd.
${primary}.log ||
return 1
553 ceph pg dump pgs |
grep -q "^$(pgid).*+failed_repair" ||
return 1
556 teardown
$dir ||
return 1
559 function TEST_repair_stats
() {
561 local poolname
=testpool
564 # This need to be an even number
567 # Launch a cluster with 5 seconds scrub interval
568 setup
$dir ||
return 1
569 run_mon
$dir a ||
return 1
570 run_mgr
$dir x ||
return 1
571 local ceph_osd_args
="--osd_deep_scrub_randomize_ratio=0 \
572 --osd-scrub-interval-randomize-ratio=0"
573 for id
in $
(seq 0 $
(expr $OSDS - 1)) ; do
574 run_osd
$dir $id $ceph_osd_args ||
return 1
577 create_pool
$poolname 1 1 ||
return 1
578 ceph osd pool
set $poolname size
2
579 wait_for_clean ||
return 1
583 echo $payload > $dir/ORIGINAL
584 for i
in $
(seq 1 $OBJS)
586 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
589 # Remove the object from one shard physically
590 # Restarted osd get $ceph_osd_args passed
591 local other
=$
(get_not_primary
$poolname obj1
)
592 local pgid
=$
(get_pg
$poolname obj1
)
593 local primary
=$
(get_primary
$poolname obj1
)
595 kill_daemons
$dir TERM osd.
$other >&2 < /dev
/null ||
return 1
596 kill_daemons
$dir TERM osd.
$primary >&2 < /dev
/null ||
return 1
597 for i
in $
(seq 1 $REPAIRS)
599 # Remove from both osd.0 and osd.1
601 _objectstore_tool_nodown
$dir $OSD obj
$i remove ||
return 1
603 run_osd
$dir $primary $ceph_osd_args ||
return 1
604 run_osd
$dir $other $ceph_osd_args ||
return 1
605 wait_for_clean ||
return 1
608 wait_for_clean ||
return 1
611 # This should have caused 1 object to be repaired
612 ceph pg
$pgid query | jq
'.info.stats.stat_sum'
613 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
614 test "$COUNT" = "$REPAIRS" ||
return 1
616 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $primary )"
617 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $primary ).num_shards_repaired")
618 test "$COUNT" = "$(expr $REPAIRS / 2)" ||
return 1
620 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $other )"
621 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $other ).num_shards_repaired")
622 test "$COUNT" = "$(expr $REPAIRS / 2)" ||
return 1
624 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum"
625 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum.num_shards_repaired")
626 test "$COUNT" = "$REPAIRS" ||
return 1
629 teardown
$dir ||
return 1
632 function TEST_repair_stats_ec
() {
634 local poolname
=testpool
637 # This need to be an even number
639 local allow_overwrites
=false
641 # Launch a cluster with 5 seconds scrub interval
642 setup
$dir ||
return 1
643 run_mon
$dir a ||
return 1
644 run_mgr
$dir x ||
return 1
645 local ceph_osd_args
="--osd_deep_scrub_randomize_ratio=0 \
646 --osd-scrub-interval-randomize-ratio=0"
647 for id
in $
(seq 0 $
(expr $OSDS - 1)) ; do
648 run_osd
$dir $id $ceph_osd_args ||
return 1
652 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 ||
return 1
656 echo $payload > $dir/ORIGINAL
657 for i
in $
(seq 1 $OBJS)
659 rados
--pool $poolname put obj
$i $dir/ORIGINAL ||
return 1
662 # Remove the object from one shard physically
663 # Restarted osd get $ceph_osd_args passed
664 local other
=$
(get_not_primary
$poolname obj1
)
665 local pgid
=$
(get_pg
$poolname obj1
)
666 local primary
=$
(get_primary
$poolname obj1
)
668 kill_daemons
$dir TERM osd.
$other >&2 < /dev
/null ||
return 1
669 kill_daemons
$dir TERM osd.
$primary >&2 < /dev
/null ||
return 1
670 for i
in $
(seq 1 $REPAIRS)
672 # Remove from both osd.0 and osd.1
674 _objectstore_tool_nodown
$dir $OSD obj
$i remove ||
return 1
676 run_osd
$dir $primary $ceph_osd_args ||
return 1
677 run_osd
$dir $other $ceph_osd_args ||
return 1
678 wait_for_clean ||
return 1
681 wait_for_clean ||
return 1
684 # This should have caused 1 object to be repaired
685 ceph pg
$pgid query | jq
'.info.stats.stat_sum'
686 COUNT
=$
(ceph pg
$pgid query | jq
'.info.stats.stat_sum.num_objects_repaired')
687 test "$COUNT" = "$REPAIRS" ||
return 1
689 for osd
in $
(seq 0 $
(expr $OSDS - 1)) ; do
690 if [ $osd = $other -o $osd = $primary ]; then
691 repair
=$
(expr $REPAIRS / 2)
696 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $osd )"
697 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats[] | select(.osd == $osd ).num_shards_repaired")
698 test "$COUNT" = "$repair" ||
return 1
701 ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum"
702 COUNT
=$
(ceph pg dump
--format=json-pretty | jq
".pg_map.osd_stats_sum.num_shards_repaired")
703 test "$COUNT" = "$REPAIRS" ||
return 1
706 teardown
$dir ||
return 1
# Corrupt/repair cycle on a jerasure k=2 m=2 EC pool across 4 OSDs.
# $1 - test dir, $2 - "true" to use overwrite-capable (Bluestore) OSDs.
function corrupt_and_repair_jerasure() {
    local dir=$1
    local allow_overwrites=$2
    local poolname=ecpool

    setup $dir || return 1
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for id in $(seq 0 3) ; do
        if [ "$allow_overwrites" = "true" ]; then
            run_osd $dir $id || return 1
        else
            run_osd_filestore $dir $id || return 1
        fi
    done
    create_rbd_pool || return 1
    wait_for_clean || return 1

    create_ec_pool $poolname $allow_overwrites k=2 m=2 || return 1
    corrupt_and_repair_erasure_coded $dir $poolname || return 1

    teardown $dir || return 1
}
# Append-only variant: runs everywhere.
function TEST_corrupt_and_repair_jerasure_appends() {
    corrupt_and_repair_jerasure $1 false
}
# Overwrite variant: only runs where EC overwrites are allowed (see top of file).
function TEST_corrupt_and_repair_jerasure_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_and_repair_jerasure $1 true
    fi
}
# Corrupt/repair cycle on an LRC (k=4 m=2 l=3) EC pool across 10 OSDs.
# $1 - test dir, $2 - "true" to use overwrite-capable (Bluestore) OSDs.
function corrupt_and_repair_lrc() {
    local dir=$1
    local allow_overwrites=$2
    local poolname=ecpool

    setup $dir || return 1
    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for id in $(seq 0 9) ; do
        if [ "$allow_overwrites" = "true" ]; then
            run_osd $dir $id || return 1
        else
            run_osd_filestore $dir $id || return 1
        fi
    done
    create_rbd_pool || return 1
    wait_for_clean || return 1

    create_ec_pool $poolname $allow_overwrites k=4 m=2 l=3 plugin=lrc || return 1
    corrupt_and_repair_erasure_coded $dir $poolname || return 1

    teardown $dir || return 1
}
# Append-only variant: runs everywhere.
function TEST_corrupt_and_repair_lrc_appends() {
    corrupt_and_repair_lrc $1 false
}
# Overwrite variant: only runs where EC overwrites are allowed (see top of file).
function TEST_corrupt_and_repair_lrc_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_and_repair_lrc $1 true
    fi
}
777 function unfound_erasure_coded
() {
779 local allow_overwrites
=$2
780 local poolname
=ecpool
783 setup
$dir ||
return 1
784 run_mon
$dir a ||
return 1
785 run_mgr
$dir x ||
return 1
786 for id
in $
(seq 0 3) ; do
787 if [ "$allow_overwrites" = "true" ]; then
788 run_osd
$dir $id ||
return 1
790 run_osd_filestore
$dir $id ||
return 1
794 create_ec_pool
$poolname $allow_overwrites k
=2 m
=2 ||
return 1
796 add_something
$dir $poolname ||
return 1
798 local primary
=$
(get_primary
$poolname SOMETHING
)
799 local -a osds
=($
(get_osds
$poolname SOMETHING |
sed -e "s/$primary//"))
800 local not_primary_first
=${osds[0]}
801 local not_primary_second
=${osds[1]}
802 local not_primary_third
=${osds[2]}
805 # 1) remove the corresponding file from the OSDs
808 run_in_background pids objectstore_tool
$dir $not_primary_first SOMETHING remove
809 run_in_background pids objectstore_tool
$dir $not_primary_second SOMETHING remove
810 run_in_background pids objectstore_tool
$dir $not_primary_third SOMETHING remove
813 if [ $return_code -ne 0 ]; then return $return_code; fi
818 local pg
=$
(get_pg
$poolname SOMETHING
)
823 # it may take a bit to appear due to mon/mgr asynchrony
824 for f
in `seq 1 60`; do
825 ceph
-s |
grep "1/1 objects unfound" && break
828 ceph
-s|
grep "4 up" ||
return 1
829 ceph
-s|
grep "4 in" ||
return 1
830 ceph
-s|
grep "1/1 objects unfound" ||
return 1
832 teardown
$dir ||
return 1
# Append-only variant: runs everywhere.
function TEST_unfound_erasure_coded_appends() {
    unfound_erasure_coded $1 false
}
# Overwrite variant: only runs where EC overwrites are allowed (see top of file).
function TEST_unfound_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        unfound_erasure_coded $1 true
    fi
}
846 # list_missing for EC pool
848 function list_missing_erasure_coded
() {
850 local allow_overwrites
=$2
851 local poolname
=ecpool
853 setup
$dir ||
return 1
854 run_mon
$dir a ||
return 1
855 run_mgr
$dir x ||
return 1
856 for id
in $
(seq 0 2) ; do
857 if [ "$allow_overwrites" = "true" ]; then
858 run_osd
$dir $id ||
return 1
860 run_osd_filestore
$dir $id ||
return 1
863 create_rbd_pool ||
return 1
864 wait_for_clean ||
return 1
866 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 ||
return 1
868 # Put an object and remove the two shards (including primary)
869 add_something
$dir $poolname MOBJ0 ||
return 1
870 local -a osds0
=($
(get_osds
$poolname MOBJ0
))
872 # Put another object and remove two shards (excluding primary)
873 add_something
$dir $poolname MOBJ1 ||
return 1
874 local -a osds1
=($
(get_osds
$poolname MOBJ1
))
876 # Stop all osd daemons
877 for id
in $
(seq 0 2) ; do
878 kill_daemons
$dir TERM osd.
$id >&2 < /dev
/null ||
return 1
882 ceph-objectstore-tool
--data-path $dir/$id \
883 MOBJ0 remove ||
return 1
885 ceph-objectstore-tool
--data-path $dir/$id \
886 MOBJ0 remove ||
return 1
889 ceph-objectstore-tool
--data-path $dir/$id \
890 MOBJ1 remove ||
return 1
892 ceph-objectstore-tool
--data-path $dir/$id \
893 MOBJ1 remove ||
return 1
895 for id
in $
(seq 0 2) ; do
896 activate_osd
$dir $id >&2 ||
return 1
898 create_rbd_pool ||
return 1
899 wait_for_clean ||
return 1
901 # Get get - both objects should in the same PG
902 local pg
=$
(get_pg
$poolname MOBJ0
)
904 # Repair the PG, which triggers the recovering,
905 # and should mark the object as unfound
908 for i
in $
(seq 0 120) ; do
909 [ $i -lt 60 ] ||
return 1
910 matches
=$
(ceph pg
$pg list_unfound |
egrep "MOBJ0|MOBJ1" |
wc -l)
911 [ $matches -eq 2 ] && break
914 teardown
$dir ||
return 1
# Append-only variant: runs everywhere.
function TEST_list_missing_erasure_coded_appends() {
    list_missing_erasure_coded $1 false
}
# Overwrite variant: only runs where EC overwrites are allowed (see top of file).
function TEST_list_missing_erasure_coded_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        list_missing_erasure_coded $1 true
    fi
}
928 # Corrupt one copy of a replicated pool
930 function TEST_corrupt_scrub_replicated
() {
932 local poolname
=csr_pool
935 setup
$dir ||
return 1
936 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
937 run_mgr
$dir x ||
return 1
938 run_osd
$dir 0 ||
return 1
939 run_osd
$dir 1 ||
return 1
940 create_rbd_pool ||
return 1
941 wait_for_clean ||
return 1
943 create_pool foo
1 ||
return 1
944 create_pool
$poolname 1 1 ||
return 1
945 wait_for_clean ||
return 1
947 for i
in $
(seq 1 $total_objs) ; do
949 add_something
$dir $poolname $objname ||
return 1
951 rados
--pool $poolname setomapheader
$objname hdr-
$objname ||
return 1
952 rados
--pool $poolname setomapval
$objname key-
$objname val-
$objname ||
return 1
955 # Increase file 1 MB + 1KB
956 dd if=/dev
/zero of
=$dir/new.ROBJ19 bs
=1024 count
=1025
957 rados
--pool $poolname put
$objname $dir/new.ROBJ19 ||
return 1
958 rm -f $dir/new.ROBJ19
960 local pg
=$
(get_pg
$poolname ROBJ0
)
961 local primary
=$
(get_primary
$poolname ROBJ0
)
963 # Compute an old omap digest and save oi
964 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd
.0) \
965 config
set osd_deep_scrub_update_digest_min_age
0
966 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd
.1) \
967 config
set osd_deep_scrub_update_digest_min_age
0
970 for i
in $
(seq 1 $total_objs) ; do
973 # Alternate corruption between osd.0 and osd.1
974 local osd
=$
(expr $i % 2)
978 # Size (deep scrub data_digest too)
979 local payload
=UVWXYZZZ
980 echo $payload > $dir/CORRUPT
981 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
985 # digest (deep scrub only)
987 echo $payload > $dir/CORRUPT
988 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
993 objectstore_tool
$dir $osd $objname remove ||
return 1
997 # Modify omap value (deep scrub only)
998 objectstore_tool
$dir $osd $objname set-omap key-
$objname $dir/CORRUPT ||
return 1
1002 # Delete omap key (deep scrub only)
1003 objectstore_tool
$dir $osd $objname rm-omap key-
$objname ||
return 1
1007 # Add extra omap key (deep scrub only)
1008 echo extra
> $dir/extra-val
1009 objectstore_tool
$dir $osd $objname set-omap key2-
$objname $dir/extra-val ||
return 1
1014 # Modify omap header (deep scrub only)
1015 echo -n newheader
> $dir/hdr
1016 objectstore_tool
$dir $osd $objname set-omaphdr
$dir/hdr ||
return 1
1021 rados
--pool $poolname setxattr
$objname key1-
$objname val1-
$objname ||
return 1
1022 rados
--pool $poolname setxattr
$objname key2-
$objname val2-
$objname ||
return 1
1025 echo -n bad-val
> $dir/bad-val
1026 objectstore_tool
$dir $osd $objname set-attr _key1-
$objname $dir/bad-val ||
return 1
1027 objectstore_tool
$dir $osd $objname rm-attr _key2-
$objname ||
return 1
1028 echo -n val3-
$objname > $dir/newval
1029 objectstore_tool
$dir $osd $objname set-attr _key3-
$objname $dir/newval ||
return 1
1030 rm $dir/bad-val
$dir/newval
1034 objectstore_tool
$dir $osd $objname get-attr _
> $dir/robj9-oi
1035 echo -n D
> $dir/change
1036 rados
--pool $poolname put
$objname $dir/change
1037 objectstore_tool
$dir $osd $objname set-attr _
$dir/robj9-oi
1038 rm $dir/oi
$dir/change
1041 # ROBJ10 must be handled after digests are re-computed by a deep scrub below
1042 # ROBJ11 must be handled with config change before deep scrub
1043 # ROBJ12 must be handled with config change before scrubs
1044 # ROBJ13 must be handled before scrubs
1047 echo -n bad-val
> $dir/bad-val
1048 objectstore_tool
$dir 0 $objname set-attr _
$dir/bad-val ||
return 1
1049 objectstore_tool
$dir 1 $objname rm-attr _ ||
return 1
1054 objectstore_tool
$dir $osd $objname rm-attr _ ||
return 1
1058 objectstore_tool
$dir 0 $objname rm-attr snapset ||
return 1
1059 echo -n bad-val
> $dir/bad-val
1060 objectstore_tool
$dir 1 $objname set-attr snapset
$dir/bad-val ||
return 1
1064 # Deep-scrub only (all replicas are diffent than the object info
1065 local payload
=ROBJ17
1066 echo $payload > $dir/new.ROBJ17
1067 objectstore_tool
$dir 0 $objname set-bytes
$dir/new.ROBJ17 ||
return 1
1068 objectstore_tool
$dir 1 $objname set-bytes
$dir/new.ROBJ17 ||
return 1
1072 # Deep-scrub only (all replicas are diffent than the object info
1073 local payload
=ROBJ18
1074 echo $payload > $dir/new.ROBJ18
1075 objectstore_tool
$dir 0 $objname set-bytes
$dir/new.ROBJ18 ||
return 1
1076 objectstore_tool
$dir 1 $objname set-bytes
$dir/new.ROBJ18 ||
return 1
1077 # Make one replica have a different object info, so a full repair must happen too
1078 objectstore_tool
$dir $osd $objname corrupt-info ||
return 1
1082 # Set osd-max-object-size smaller than this object's size
1087 local pg
=$
(get_pg
$poolname ROBJ0
)
1089 ceph tell osd.\
* injectargs
-- --osd-max-object-size=1048576
1091 inject_eio rep data
$poolname ROBJ11
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
1092 inject_eio rep mdata
$poolname ROBJ12
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
1093 inject_eio rep mdata
$poolname ROBJ13
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
1094 inject_eio rep data
$poolname ROBJ13
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
1099 declare -a s_err_strings
1100 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:30259878:::ROBJ15:head : candidate had a missing info key"
1101 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:33aca486:::ROBJ18:head : object info inconsistent "
1102 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:5c7b2c47:::ROBJ16:head : candidate had a corrupt snapset"
1103 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:5c7b2c47:::ROBJ16:head : candidate had a missing snapset key"
1104 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:5c7b2c47:::ROBJ16:head : failed to pick suitable object info"
1105 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:86586531:::ROBJ8:head : attr value mismatch '_key1-ROBJ8', attr name mismatch '_key3-ROBJ8', attr name mismatch '_key2-ROBJ8'"
1106 err_strings
[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:bc819597:::ROBJ12:head : candidate had a stat error"
1107 err_strings
[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:c0c86b1d:::ROBJ14:head : candidate had a missing info key"
1108 err_strings
[8]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:c0c86b1d:::ROBJ14:head : candidate had a corrupt info"
1109 err_strings
[9]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:c0c86b1d:::ROBJ14:head : failed to pick suitable object info"
1110 err_strings
[10]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : candidate size 9 info size 7 mismatch"
1111 err_strings
[11]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : size 9 != size 7 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from shard 0"
1112 err_strings
[12]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:d60617f9:::ROBJ13:head : candidate had a stat error"
1113 err_strings
[13]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 3:f2a5b2a4:::ROBJ3:head : missing"
1114 err_strings
[14]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ffdb2004:::ROBJ9:head : candidate size 1 info size 7 mismatch"
1115 err_strings
[15]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ffdb2004:::ROBJ9:head : object info inconsistent "
1116 err_strings
[16]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 3:c0c86b1d:::ROBJ14:head : no '_' attr"
1117 err_strings
[17]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 3:5c7b2c47:::ROBJ16:head : can't decode 'snapset' attr buffer::malformed_input: .* no longer understand old encoding version 3 < 97"
1118 err_strings
[18]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub : stat mismatch, got 19/19 objects, 0/0 clones, 18/19 dirty, 18/19 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 1049713/1049720 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
1119 err_strings
[19]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 1 missing, 8 inconsistent objects"
1120 err_strings
[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 18 errors"
1121 err_strings
[21]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:123a5f55:::ROBJ19:head : size 1049600 > 1048576 is too large"
1123 for err_string
in "${err_strings[@]}"
1125 if ! grep -q "$err_string" $dir/osd.
${primary}.log
1127 echo "Missing log message '$err_string'"
1128 ERRORS
=$
(expr $ERRORS + 1)
1132 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
1134 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
1136 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
1138 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
1139 # Get epoch for repair-get requests
1140 epoch
=$
(jq .epoch
$dir/json
)
1142 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
1165 "prior_version": "21'3",
1166 "last_reqid": "osd.1.0:57",
1180 "data_digest": "0x2ddbf8f5",
1181 "omap_digest": "0xf5fba2c6",
1182 "expected_object_size": 0,
1183 "expected_write_size": 0,
1184 "alloc_hint_flags": 0,
1192 "size_mismatch_info",
1193 "obj_size_info_mismatch"
1199 "selected_object_info": {
1210 "prior_version": "21'3",
1211 "last_reqid": "osd.1.0:57",
1214 "mtime": "2018-04-05 14:33:19.804040",
1215 "local_mtime": "2018-04-05 14:33:19.804839",
1225 "data_digest": "0x2ddbf8f5",
1226 "omap_digest": "0xf5fba2c6",
1227 "expected_object_size": 0,
1228 "expected_write_size": 0,
1229 "alloc_hint_flags": 0,
1235 "union_shard_errors": [
1236 "size_mismatch_info",
1237 "obj_size_info_mismatch"
1266 "selected_object_info": {
1277 "prior_version": "43'36",
1278 "last_reqid": "osd.1.0:55",
1292 "data_digest": "0x2ddbf8f5",
1293 "omap_digest": "0x067f306a",
1294 "expected_object_size": 0,
1295 "expected_write_size": 0,
1296 "alloc_hint_flags": 0,
1302 "union_shard_errors": [
1330 "selected_object_info": {
1341 "prior_version": "45'39",
1342 "last_reqid": "osd.1.0:58",
1356 "data_digest": "0x2ddbf8f5",
1357 "omap_digest": "0x6441854d",
1358 "expected_object_size": 0,
1359 "expected_write_size": 0,
1360 "alloc_hint_flags": 0,
1366 "union_shard_errors": [
1381 "object_info": "bad-val",
1398 "union_shard_errors": [
1425 "prior_version": "49'45",
1426 "last_reqid": "osd.1.0:48",
1429 "mtime": "2018-04-05 14:33:29.498969",
1430 "local_mtime": "2018-04-05 14:33:29.499890",
1440 "data_digest": "0x2ddbf8f5",
1441 "omap_digest": "0x2d2a4d6e",
1442 "expected_object_size": 0,
1443 "expected_write_size": 0,
1444 "alloc_hint_flags": 0,
1464 "selected_object_info": {
1475 "prior_version": "49'45",
1476 "last_reqid": "osd.1.0:48",
1490 "data_digest": "0x2ddbf8f5",
1491 "omap_digest": "0x2d2a4d6e",
1492 "expected_object_size": 0,
1493 "expected_write_size": 0,
1494 "alloc_hint_flags": 0,
1500 "union_shard_errors": [
1536 "snapset": "bad-val",
1540 "union_shard_errors": [
1547 "object_info_inconsistency"
1555 "selected_object_info": {
1556 "alloc_hint_flags": 255,
1557 "data_digest": "0x2ddbf8f5",
1558 "expected_object_size": 0,
1559 "expected_write_size": 0,
1579 "omap_digest": "0xddc3680f",
1590 "alloc_hint_flags": 0,
1591 "data_digest": "0x2ddbf8f5",
1592 "expected_object_size": 0,
1593 "expected_write_size": 0,
1613 "omap_digest": "0xddc3680f",
1627 "alloc_hint_flags": 255,
1628 "data_digest": "0x2ddbf8f5",
1629 "expected_object_size": 0,
1630 "expected_write_size": 0,
1650 "omap_digest": "0xddc3680f",
1662 "union_shard_errors": []
1675 "union_shard_errors": [],
1676 "selected_object_info": {
1687 "prior_version": "63'58",
1688 "last_reqid": "osd.1.0:58",
1691 "mtime": "2019-08-09T23:33:58.340709+0000",
1692 "local_mtime": "2019-08-09T23:33:58.345676+0000",
1702 "data_digest": "0x3dde0ef3",
1703 "omap_digest": "0xbffddd28",
1704 "expected_object_size": 0,
1705 "expected_write_size": 0,
1706 "alloc_hint_flags": 0,
1743 "selected_object_info": {
1754 "prior_version": "25'9",
1755 "last_reqid": "osd.1.0:60",
1769 "data_digest": "0x2ddbf8f5",
1770 "omap_digest": "0x00b35dfd",
1771 "expected_object_size": 0,
1772 "expected_write_size": 0,
1773 "alloc_hint_flags": 0,
1779 "union_shard_errors": [
1798 "name": "key1-ROBJ8"
1802 "value": "val2-ROBJ8",
1803 "name": "key2-ROBJ8"
1815 "value": "val1-ROBJ8",
1816 "name": "key1-ROBJ8"
1820 "value": "val3-ROBJ8",
1821 "name": "key3-ROBJ8"
1830 "selected_object_info": {
1841 "prior_version": "79'65",
1842 "last_reqid": "client.4554.0:1",
1856 "data_digest": "0x2ddbf8f5",
1857 "omap_digest": "0xd6be81dc",
1858 "expected_object_size": 0,
1859 "expected_write_size": 0,
1860 "alloc_hint_flags": 0,
1866 "union_shard_errors": [],
1868 "attr_value_mismatch",
1869 "attr_name_mismatch"
1893 "prior_version": "51'64",
1894 "last_reqid": "client.4649.0:1",
1908 "data_digest": "0x2b63260d",
1909 "omap_digest": "0x2eecc539",
1910 "expected_object_size": 0,
1911 "expected_write_size": 0,
1912 "alloc_hint_flags": 0,
1935 "prior_version": "37'27",
1936 "last_reqid": "osd.1.0:63",
1939 "mtime": "2018-04-05 14:33:25.352485",
1940 "local_mtime": "2018-04-05 14:33:25.353746",
1950 "data_digest": "0x2ddbf8f5",
1951 "omap_digest": "0x2eecc539",
1952 "expected_object_size": 0,
1953 "expected_write_size": 0,
1954 "alloc_hint_flags": 0,
1962 "obj_size_info_mismatch"
1968 "selected_object_info": {
1979 "prior_version": "51'64",
1980 "last_reqid": "client.4649.0:1",
1994 "data_digest": "0x2b63260d",
1995 "omap_digest": "0x2eecc539",
1996 "expected_object_size": 0,
1997 "expected_write_size": 0,
1998 "alloc_hint_flags": 0,
2004 "union_shard_errors": [
2005 "obj_size_info_mismatch"
2008 "object_info_inconsistency"
2023 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python
-c "$sortkeys" > $dir/csjson
2024 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
2025 if test $getjson = "yes"
2027 jq
'.' $dir/json
> save1.json
2030 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
2032 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
2036 # Change data and size again because digest was recomputed
2037 echo -n ZZZ
> $dir/change
2038 rados
--pool $poolname put
$objname $dir/change
2039 # Set one to an even older value
2040 objectstore_tool
$dir 0 $objname set-attr _
$dir/robj9-oi
2041 rm $dir/oi
$dir/change
2044 objectstore_tool
$dir 1 $objname get-attr _
> $dir/oi
2045 rados
--pool $poolname setomapval
$objname key2-
$objname val2-
$objname
2046 objectstore_tool
$dir 0 $objname set-attr _
$dir/oi
2047 objectstore_tool
$dir 1 $objname set-attr _
$dir/oi
2050 inject_eio rep data
$poolname ROBJ11
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
2051 inject_eio rep mdata
$poolname ROBJ12
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
2052 inject_eio rep mdata
$poolname ROBJ13
$dir 1 ||
return 1 # shard 1 of [1, 0], osd.0
2053 inject_eio rep data
$poolname ROBJ13
$dir 0 ||
return 1 # shard 0 of [1, 0], osd.1
2055 # ROBJ19 won't error this time
2056 ceph tell osd.\
* injectargs
-- --osd-max-object-size=134217728
2061 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:30259878:::ROBJ15:head : candidate had a missing info key"
2062 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:33aca486:::ROBJ18:head : data_digest 0xbd89c912 != data_digest 0x2ddbf8f5 from auth oi 3:33aca486:::ROBJ18:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 54 dd 2ddbf8f5 od ddc3680f alloc_hint [[]0 0 255[]][)], object info inconsistent "
2063 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:33aca486:::ROBJ18:head : data_digest 0xbd89c912 != data_digest 0x2ddbf8f5 from auth oi 3:33aca486:::ROBJ18:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 54 dd 2ddbf8f5 od ddc3680f alloc_hint [[]0 0 255[]][)]"
2064 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:33aca486:::ROBJ18:head : failed to pick suitable auth object"
2065 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:5c7b2c47:::ROBJ16:head : candidate had a corrupt snapset"
2066 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:5c7b2c47:::ROBJ16:head : candidate had a missing snapset key"
2067 err_strings
[6]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:5c7b2c47:::ROBJ16:head : failed to pick suitable object info"
2068 err_strings
[7]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:86586531:::ROBJ8:head : attr value mismatch '_key1-ROBJ8', attr name mismatch '_key3-ROBJ8', attr name mismatch '_key2-ROBJ8'"
2069 err_strings
[8]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:87abbf36:::ROBJ11:head : candidate had a read error"
2070 err_strings
[9]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:8aa5320e:::ROBJ17:head : data_digest 0x5af0c3ef != data_digest 0x2ddbf8f5 from auth oi 3:8aa5320e:::ROBJ17:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 51 dd 2ddbf8f5 od e9572720 alloc_hint [[]0 0 0[]][)]"
2071 err_strings
[10]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:8aa5320e:::ROBJ17:head : data_digest 0x5af0c3ef != data_digest 0x2ddbf8f5 from auth oi 3:8aa5320e:::ROBJ17:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 51 dd 2ddbf8f5 od e9572720 alloc_hint [[]0 0 0[]][)]"
2072 err_strings
[11]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:8aa5320e:::ROBJ17:head : failed to pick suitable auth object"
2073 err_strings
[12]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:8b55fa4b:::ROBJ7:head : omap_digest 0xefced57a != omap_digest 0x6a73cc07 from shard 1"
2074 err_strings
[13]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:8b55fa4b:::ROBJ7:head : omap_digest 0x6a73cc07 != omap_digest 0xefced57a from auth oi 3:8b55fa4b:::ROBJ7:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 21 dd 2ddbf8f5 od efced57a alloc_hint [[]0 0 0[]][)]"
2075 err_strings
[14]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:a53c12e8:::ROBJ6:head : omap_digest 0x689ee887 != omap_digest 0x179c919f from shard 1, omap_digest 0x689ee887 != omap_digest 0x179c919f from auth oi 3:a53c12e8:::ROBJ6:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 18 dd 2ddbf8f5 od 179c919f alloc_hint [[]0 0 0[]][)]"
2076 err_strings
[15]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:b1f19cbd:::ROBJ10:head : omap_digest 0xa8dd5adc != omap_digest 0xc2025a24 from auth oi 3:b1f19cbd:::ROBJ10:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [[]0 0 0[]][)]"
2077 err_strings
[16]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:b1f19cbd:::ROBJ10:head : omap_digest 0xa8dd5adc != omap_digest 0xc2025a24 from auth oi 3:b1f19cbd:::ROBJ10:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 30 dd 2ddbf8f5 od c2025a24 alloc_hint [[]0 0 0[]][)]"
2078 err_strings
[17]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:b1f19cbd:::ROBJ10:head : failed to pick suitable auth object"
2079 err_strings
[18]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:bc819597:::ROBJ12:head : candidate had a stat error"
2080 err_strings
[19]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:c0c86b1d:::ROBJ14:head : candidate had a missing info key"
2081 err_strings
[20]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:c0c86b1d:::ROBJ14:head : candidate had a corrupt info"
2082 err_strings
[21]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:c0c86b1d:::ROBJ14:head : failed to pick suitable object info"
2083 err_strings
[22]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : candidate size 9 info size 7 mismatch"
2084 err_strings
[23]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:ce3f1d6a:::ROBJ1:head : data_digest 0x2d4a11c2 != data_digest 0x2ddbf8f5 from shard 0, data_digest 0x2d4a11c2 != data_digest 0x2ddbf8f5 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from auth oi 3:ce3f1d6a:::ROBJ1:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 3 dd 2ddbf8f5 od f5fba2c6 alloc_hint [[]0 0 0[]][)], size 9 != size 7 from shard 0"
2085 err_strings
[24]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:d60617f9:::ROBJ13:head : candidate had a read error"
2086 err_strings
[25]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:d60617f9:::ROBJ13:head : candidate had a stat error"
2087 err_strings
[26]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:d60617f9:::ROBJ13:head : failed to pick suitable object info"
2088 err_strings
[27]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:e97ce31e:::ROBJ2:head : data_digest 0x578a4830 != data_digest 0x2ddbf8f5 from shard 1, data_digest 0x578a4830 != data_digest 0x2ddbf8f5 from auth oi 3:e97ce31e:::ROBJ2:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 6 dd 2ddbf8f5 od f8e11918 alloc_hint [[]0 0 0[]][)]"
2089 err_strings
[28]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 3:f2a5b2a4:::ROBJ3:head : missing"
2090 err_strings
[29]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:f4981d31:::ROBJ4:head : omap_digest 0xd7178dfe != omap_digest 0xe2d46ea4 from shard 1, omap_digest 0xd7178dfe != omap_digest 0xe2d46ea4 from auth oi 3:f4981d31:::ROBJ4:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 12 dd 2ddbf8f5 od e2d46ea4 alloc_hint [[]0 0 0[]][)]"
2091 err_strings
[30]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid 3:f4bfd4d1:::ROBJ5:head : omap_digest 0x1a862a41 != omap_digest 0x6cac8f6 from shard 1"
2092 err_strings
[31]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 1 soid 3:f4bfd4d1:::ROBJ5:head : omap_digest 0x6cac8f6 != omap_digest 0x1a862a41 from auth oi 3:f4bfd4d1:::ROBJ5:head[(][0-9]*'[0-9]* osd.1.0:[0-9]* dirty|omap|data_digest|omap_digest s 7 uv 15 dd 2ddbf8f5 od 1a862a41 alloc_hint [[]0 0 0[]][)]"
2093 err_strings
[32]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:ffdb2004:::ROBJ9:head : candidate size 3 info size 7 mismatch"
2094 err_strings
[33]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 shard 0 soid 3:ffdb2004:::ROBJ9:head : object info inconsistent "
2095 err_strings
[34]="log_channel[(]cluster[)] log [[]ERR[]] : deep-scrub [0-9]*[.]0 3:c0c86b1d:::ROBJ14:head : no '_' attr"
2096 err_strings
[35]="log_channel[(]cluster[)] log [[]ERR[]] : deep-scrub [0-9]*[.]0 3:5c7b2c47:::ROBJ16:head : can't decode 'snapset' attr buffer::malformed_input: .* no longer understand old encoding version 3 < 97"
2097 err_strings
[36]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub : stat mismatch, got 19/19 objects, 0/0 clones, 18/19 dirty, 18/19 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 1049715/1049716 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
2098 err_strings
[37]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub 1 missing, 11 inconsistent objects"
2099 err_strings
[38]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 deep-scrub 35 errors"
2101 for err_string
in "${err_strings[@]}"
2103 if ! grep -q "$err_string" $dir/osd.
${primary}.log
2105 echo "Missing log message '$err_string'"
2106 ERRORS
=$
(expr $ERRORS + 1)
2110 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
2112 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
2114 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
2116 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
2117 # Get epoch for repair-get requests
2118 epoch
=$
(jq .epoch
$dir/json
)
2120 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
2126 "data_digest": "0x2ddbf8f5",
2127 "omap_digest": "0xf5fba2c6",
2145 "prior_version": "21'3",
2146 "last_reqid": "osd.1.0:57",
2149 "mtime": "2018-04-05 14:33:19.804040",
2150 "local_mtime": "2018-04-05 14:33:19.804839",
2160 "data_digest": "0x2ddbf8f5",
2161 "omap_digest": "0xf5fba2c6",
2162 "expected_object_size": 0,
2163 "expected_write_size": 0,
2164 "alloc_hint_flags": 0,
2170 "data_digest": "0x2d4a11c2",
2171 "omap_digest": "0xf5fba2c6",
2174 "data_digest_mismatch_info",
2175 "size_mismatch_info",
2176 "obj_size_info_mismatch"
2182 "selected_object_info": {
2193 "prior_version": "21'3",
2194 "last_reqid": "osd.1.0:57",
2197 "mtime": "2018-04-05 14:33:19.804040",
2198 "local_mtime": "2018-04-05 14:33:19.804839",
2208 "data_digest": "0x2ddbf8f5",
2209 "omap_digest": "0xf5fba2c6",
2210 "expected_object_size": 0,
2211 "expected_write_size": 0,
2212 "alloc_hint_flags": 0,
2218 "union_shard_errors": [
2219 "data_digest_mismatch_info",
2220 "size_mismatch_info",
2221 "obj_size_info_mismatch"
2224 "data_digest_mismatch",
2238 "data_digest": "0x2ddbf8f5",
2239 "omap_digest": "0xa8dd5adc",
2242 "omap_digest_mismatch_info"
2248 "data_digest": "0x2ddbf8f5",
2249 "omap_digest": "0xa8dd5adc",
2252 "omap_digest_mismatch_info"
2258 "selected_object_info": {
2259 "alloc_hint_flags": 0,
2260 "data_digest": "0x2ddbf8f5",
2261 "expected_object_size": 0,
2262 "expected_write_size": 0,
2282 "omap_digest": "0xc2025a24",
2289 "union_shard_errors": [
2290 "omap_digest_mismatch_info"
2304 "data_digest": "0x2ddbf8f5",
2305 "omap_digest": "0xa03cef03",
2320 "selected_object_info": {
2331 "prior_version": "41'33",
2332 "last_reqid": "osd.1.0:51",
2335 "mtime": "2018-04-05 14:33:26.761286",
2336 "local_mtime": "2018-04-05 14:33:26.762368",
2346 "data_digest": "0x2ddbf8f5",
2347 "omap_digest": "0xa03cef03",
2348 "expected_object_size": 0,
2349 "expected_write_size": 0,
2350 "alloc_hint_flags": 0,
2356 "union_shard_errors": [
2378 "data_digest": "0x2ddbf8f5",
2379 "omap_digest": "0x067f306a",
2386 "selected_object_info": {
2397 "prior_version": "43'36",
2398 "last_reqid": "osd.1.0:55",
2401 "mtime": "2018-04-05 14:33:27.460958",
2402 "local_mtime": "2018-04-05 14:33:27.462109",
2412 "data_digest": "0x2ddbf8f5",
2413 "omap_digest": "0x067f306a",
2414 "expected_object_size": 0,
2415 "expected_write_size": 0,
2416 "alloc_hint_flags": 0,
2422 "union_shard_errors": [
2452 "union_shard_errors": [
2468 "object_info": "bad-val",
2469 "data_digest": "0x2ddbf8f5",
2470 "omap_digest": "0x4f14f849",
2479 "data_digest": "0x2ddbf8f5",
2480 "omap_digest": "0x4f14f849",
2489 "union_shard_errors": [
2516 "prior_version": "49'45",
2517 "last_reqid": "osd.1.0:48",
2520 "mtime": "2018-04-05 14:33:29.498969",
2521 "local_mtime": "2018-04-05 14:33:29.499890",
2531 "data_digest": "0x2ddbf8f5",
2532 "omap_digest": "0x2d2a4d6e",
2533 "expected_object_size": 0,
2534 "expected_write_size": 0,
2535 "alloc_hint_flags": 0,
2541 "data_digest": "0x2ddbf8f5",
2542 "omap_digest": "0x2d2a4d6e",
2549 "data_digest": "0x2ddbf8f5",
2550 "omap_digest": "0x2d2a4d6e",
2559 "selected_object_info": {
2570 "prior_version": "49'45",
2571 "last_reqid": "osd.1.0:48",
2574 "mtime": "2018-04-05 14:33:29.498969",
2575 "local_mtime": "2018-04-05 14:33:29.499890",
2585 "data_digest": "0x2ddbf8f5",
2586 "omap_digest": "0x2d2a4d6e",
2587 "expected_object_size": 0,
2588 "expected_write_size": 0,
2589 "alloc_hint_flags": 0,
2595 "union_shard_errors": [
2618 "data_digest": "0x2ddbf8f5",
2622 "omap_digest": "0x8b699207",
2628 "snapset": "bad-val",
2629 "data_digest": "0x2ddbf8f5",
2633 "omap_digest": "0x8b699207",
2639 "union_shard_errors": [
2652 "selected_object_info": {
2653 "alloc_hint_flags": 0,
2654 "data_digest": "0x2ddbf8f5",
2655 "expected_object_size": 0,
2656 "expected_write_size": 0,
2676 "omap_digest": "0xe9572720",
2685 "data_digest": "0x5af0c3ef",
2687 "data_digest_mismatch_info"
2689 "omap_digest": "0xe9572720",
2695 "data_digest": "0x5af0c3ef",
2697 "data_digest_mismatch_info"
2699 "omap_digest": "0xe9572720",
2705 "union_shard_errors": [
2706 "data_digest_mismatch_info"
2711 "object_info_inconsistency"
2719 "selected_object_info": {
2720 "alloc_hint_flags": 255,
2721 "data_digest": "0x2ddbf8f5",
2722 "expected_object_size": 0,
2723 "expected_write_size": 0,
2743 "omap_digest": "0xddc3680f",
2752 "data_digest": "0xbd89c912",
2754 "data_digest_mismatch_info"
2757 "alloc_hint_flags": 0,
2758 "data_digest": "0x2ddbf8f5",
2759 "expected_object_size": 0,
2760 "expected_write_size": 0,
2780 "omap_digest": "0xddc3680f",
2787 "omap_digest": "0xddc3680f",
2793 "data_digest": "0xbd89c912",
2795 "data_digest_mismatch_info"
2798 "alloc_hint_flags": 255,
2799 "data_digest": "0x2ddbf8f5",
2800 "expected_object_size": 0,
2801 "expected_write_size": 0,
2821 "omap_digest": "0xddc3680f",
2828 "omap_digest": "0xddc3680f",
2834 "union_shard_errors": [
2835 "data_digest_mismatch_info"
2841 "data_digest": "0x578a4830",
2842 "omap_digest": "0xf8e11918",
2845 "data_digest_mismatch_info"
2851 "data_digest": "0x2ddbf8f5",
2852 "omap_digest": "0xf8e11918",
2859 "selected_object_info": {
2870 "prior_version": "23'6",
2871 "last_reqid": "osd.1.0:59",
2874 "mtime": "2018-04-05 14:33:20.498756",
2875 "local_mtime": "2018-04-05 14:33:20.499704",
2885 "data_digest": "0x2ddbf8f5",
2886 "omap_digest": "0xf8e11918",
2887 "expected_object_size": 0,
2888 "expected_write_size": 0,
2889 "alloc_hint_flags": 0,
2895 "union_shard_errors": [
2896 "data_digest_mismatch_info"
2899 "data_digest_mismatch"
2912 "data_digest": "0x2ddbf8f5",
2913 "omap_digest": "0x00b35dfd",
2927 "selected_object_info": {
2938 "prior_version": "25'9",
2939 "last_reqid": "osd.1.0:60",
2942 "mtime": "2018-04-05 14:33:21.189382",
2943 "local_mtime": "2018-04-05 14:33:21.190446",
2953 "data_digest": "0x2ddbf8f5",
2954 "omap_digest": "0x00b35dfd",
2955 "expected_object_size": 0,
2956 "expected_write_size": 0,
2957 "alloc_hint_flags": 0,
2963 "union_shard_errors": [
2978 "data_digest": "0x2ddbf8f5",
2979 "omap_digest": "0xd7178dfe",
2982 "omap_digest_mismatch_info"
2988 "data_digest": "0x2ddbf8f5",
2989 "omap_digest": "0xe2d46ea4",
2996 "selected_object_info": {
3007 "prior_version": "27'12",
3008 "last_reqid": "osd.1.0:61",
3011 "mtime": "2018-04-05 14:33:21.862313",
3012 "local_mtime": "2018-04-05 14:33:21.863261",
3022 "data_digest": "0x2ddbf8f5",
3023 "omap_digest": "0xe2d46ea4",
3024 "expected_object_size": 0,
3025 "expected_write_size": 0,
3026 "alloc_hint_flags": 0,
3032 "union_shard_errors": [
3033 "omap_digest_mismatch_info"
3036 "omap_digest_mismatch"
3049 "data_digest": "0x2ddbf8f5",
3050 "omap_digest": "0x1a862a41",
3057 "data_digest": "0x2ddbf8f5",
3058 "omap_digest": "0x06cac8f6",
3061 "omap_digest_mismatch_info"
3067 "selected_object_info": {
3078 "prior_version": "29'15",
3079 "last_reqid": "osd.1.0:62",
3082 "mtime": "2018-04-05 14:33:22.589300",
3083 "local_mtime": "2018-04-05 14:33:22.590376",
3093 "data_digest": "0x2ddbf8f5",
3094 "omap_digest": "0x1a862a41",
3095 "expected_object_size": 0,
3096 "expected_write_size": 0,
3097 "alloc_hint_flags": 0,
3103 "union_shard_errors": [
3104 "omap_digest_mismatch_info"
3107 "omap_digest_mismatch"
3120 "data_digest": "0x2ddbf8f5",
3121 "omap_digest": "0x689ee887",
3124 "omap_digest_mismatch_info"
3130 "data_digest": "0x2ddbf8f5",
3131 "omap_digest": "0x179c919f",
3138 "selected_object_info": {
3149 "prior_version": "31'18",
3150 "last_reqid": "osd.1.0:53",
3153 "mtime": "2018-04-05 14:33:23.289188",
3154 "local_mtime": "2018-04-05 14:33:23.290130",
3164 "data_digest": "0x2ddbf8f5",
3165 "omap_digest": "0x179c919f",
3166 "expected_object_size": 0,
3167 "expected_write_size": 0,
3168 "alloc_hint_flags": 0,
3174 "union_shard_errors": [
3175 "omap_digest_mismatch_info"
3178 "omap_digest_mismatch"
3191 "data_digest": "0x2ddbf8f5",
3192 "omap_digest": "0xefced57a",
3199 "data_digest": "0x2ddbf8f5",
3200 "omap_digest": "0x6a73cc07",
3203 "omap_digest_mismatch_info"
3209 "selected_object_info": {
3220 "prior_version": "33'21",
3221 "last_reqid": "osd.1.0:52",
3224 "mtime": "2018-04-05 14:33:23.979658",
3225 "local_mtime": "2018-04-05 14:33:23.980731",
3235 "data_digest": "0x2ddbf8f5",
3236 "omap_digest": "0xefced57a",
3237 "expected_object_size": 0,
3238 "expected_write_size": 0,
3239 "alloc_hint_flags": 0,
3245 "union_shard_errors": [
3246 "omap_digest_mismatch_info"
3249 "omap_digest_mismatch"
3266 "name": "key1-ROBJ8"
3270 "value": "val2-ROBJ8",
3271 "name": "key2-ROBJ8"
3274 "data_digest": "0x2ddbf8f5",
3275 "omap_digest": "0xd6be81dc",
3285 "value": "val1-ROBJ8",
3286 "name": "key1-ROBJ8"
3290 "value": "val3-ROBJ8",
3291 "name": "key3-ROBJ8"
3294 "data_digest": "0x2ddbf8f5",
3295 "omap_digest": "0xd6be81dc",
3302 "selected_object_info": {
3313 "prior_version": "79'65",
3314 "last_reqid": "client.4554.0:1",
3317 "mtime": "2018-04-05 14:34:05.598688",
3318 "local_mtime": "2018-04-05 14:34:05.599698",
3328 "data_digest": "0x2ddbf8f5",
3329 "omap_digest": "0xd6be81dc",
3330 "expected_object_size": 0,
3331 "expected_write_size": 0,
3332 "alloc_hint_flags": 0,
3338 "union_shard_errors": [],
3340 "attr_value_mismatch",
3341 "attr_name_mismatch"
3365 "prior_version": "37'27",
3366 "last_reqid": "osd.1.0:63",
3369 "mtime": "2018-04-05 14:33:25.352485",
3370 "local_mtime": "2018-04-05 14:33:25.353746",
3380 "data_digest": "0x2ddbf8f5",
3381 "omap_digest": "0x2eecc539",
3382 "expected_object_size": 0,
3383 "expected_write_size": 0,
3384 "alloc_hint_flags": 0,
3390 "data_digest": "0x1f26fb26",
3391 "omap_digest": "0x2eecc539",
3394 "obj_size_info_mismatch"
3410 "version": "119'68",
3411 "prior_version": "51'64",
3412 "last_reqid": "client.4834.0:1",
3415 "mtime": "2018-04-05 14:35:01.500659",
3416 "local_mtime": "2018-04-05 14:35:01.502117",
3426 "data_digest": "0x1f26fb26",
3427 "omap_digest": "0x2eecc539",
3428 "expected_object_size": 0,
3429 "expected_write_size": 0,
3430 "alloc_hint_flags": 0,
3436 "data_digest": "0x1f26fb26",
3437 "omap_digest": "0x2eecc539",
3444 "selected_object_info": {
3454 "version": "119'68",
3455 "prior_version": "51'64",
3456 "last_reqid": "client.4834.0:1",
3459 "mtime": "2018-04-05 14:35:01.500659",
3460 "local_mtime": "2018-04-05 14:35:01.502117",
3470 "data_digest": "0x1f26fb26",
3471 "omap_digest": "0x2eecc539",
3472 "expected_object_size": 0,
3473 "expected_write_size": 0,
3474 "alloc_hint_flags": 0,
3480 "union_shard_errors": [
3481 "obj_size_info_mismatch"
3484 "object_info_inconsistency"
3499 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python
-c "$sortkeys" > $dir/csjson
3500 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
3501 if test $getjson = "yes"
3503 jq
'.' $dir/json
> save2.json
3506 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
3508 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
3514 # This hangs if the repair doesn't work
3515 timeout
30 rados
-p $poolname get ROBJ17
$dir/robj17.out ||
return 1
3516 timeout
30 rados
-p $poolname get ROBJ18
$dir/robj18.out ||
return 1
3517 # Even though we couldn't repair all of the introduced errors, we can fix ROBJ17
3518 diff -q $dir/new.ROBJ17
$dir/robj17.out ||
return 1
3519 rm -f $dir/new.ROBJ17
$dir/robj17.out ||
return 1
3520 diff -q $dir/new.ROBJ18
$dir/robj18.out ||
return 1
3521 rm -f $dir/new.ROBJ18
$dir/robj18.out ||
return 1
3523 if [ $ERRORS != "0" ];
3525 echo "TEST FAILED WITH $ERRORS ERRORS"
3529 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
3530 teardown
$dir ||
return 1
3535 # Test scrub errors for an erasure coded pool
# corrupt_scrub_erasure: exercise scrub error reporting on an erasure-coded
# pool ($2 = allow_overwrites; "true" uses run_osd, otherwise filestore OSDs).
# Presumably $1 is the test directory ($dir) — the 'local dir=$1' line is
# absent from this extraction; TODO confirm against the original file.
# Visible flow: set up mon/mgr and 3 OSDs, create a k=2 m=1 EC pool, corrupt
# objects via objectstore_tool (set-bytes / remove / xattr tampering /
# hinfo_key corruption), then compare 'rados list-inconsistent-obj' output
# (filtered through $jqfilter and $sortkeys) against expected JSON heredocs.
# NOTE(review): this text is a line-wrapped/lossy extraction — logical shell
# lines are split and many original lines (including most heredoc JSON
# content and the EOF terminators) are missing. It is preserved byte-for-byte
# below; restore from the original source rather than editing in place.
3537 function corrupt_scrub_erasure
() {
3539 local allow_overwrites
=$2
3540 local poolname
=ecpool
3543 setup
$dir ||
return 1
3544 run_mon
$dir a ||
return 1
3545 run_mgr
$dir x ||
return 1
3546 for id
in $
(seq 0 2) ; do
3547 if [ "$allow_overwrites" = "true" ]; then
3548 run_osd
$dir $id ||
return 1
3550 run_osd_filestore
$dir $id ||
return 1
3553 create_rbd_pool ||
return 1
3556 create_ec_pool
$poolname $allow_overwrites k
=2 m
=1 stripe_unit
=2K
--force ||
return 1
3557 wait_for_clean ||
return 1
3559 for i
in $
(seq 1 $total_objs) ; do
3561 add_something
$dir $poolname $objname ||
return 1
3563 local osd
=$
(expr $i % 2)
3567 # Size (deep scrub data_digest too)
3568 local payload
=UVWXYZZZ
3569 echo $payload > $dir/CORRUPT
3570 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3575 dd if=/dev
/urandom of
=$dir/CORRUPT bs
=2048 count
=1
3576 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3581 objectstore_tool
$dir $osd $objname remove ||
return 1
3585 rados
--pool $poolname setxattr
$objname key1-
$objname val1-
$objname ||
return 1
3586 rados
--pool $poolname setxattr
$objname key2-
$objname val2-
$objname ||
return 1
3589 echo -n bad-val
> $dir/bad-val
3590 objectstore_tool
$dir $osd $objname set-attr _key1-
$objname $dir/bad-val ||
return 1
3591 objectstore_tool
$dir $osd $objname rm-attr _key2-
$objname ||
return 1
3592 echo -n val3-
$objname > $dir/newval
3593 objectstore_tool
$dir $osd $objname set-attr _key3-
$objname $dir/newval ||
return 1
3594 rm $dir/bad-val
$dir/newval
3599 dd if=/dev
/urandom of
=$dir/CORRUPT bs
=2048 count
=2
3600 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
3604 objectstore_tool
$dir 0 $objname rm-attr hinfo_key ||
return 1
3605 echo -n bad-val
> $dir/bad-val
3606 objectstore_tool
$dir 1 $objname set-attr hinfo_key
$dir/bad-val ||
return 1
3610 local payload
=MAKETHISDIFFERENTFROMOTHEROBJECTS
3611 echo $payload > $dir/DIFFERENT
3612 rados
--pool $poolname put
$objname $dir/DIFFERENT ||
return 1
3614 # Get hinfo_key from EOBJ1
3615 objectstore_tool
$dir 0 EOBJ1 get-attr hinfo_key
> $dir/hinfo
3616 objectstore_tool
$dir 0 $objname set-attr hinfo_key
$dir/hinfo ||
return 1
3623 local pg
=$
(get_pg
$poolname EOBJ0
)
3627 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
3629 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
3631 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
3633 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
3634 # Get epoch for repair-get requests
3635 epoch
=$
(jq .epoch
$dir/json
)
3637 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
3661 "prior_version": "0'0",
3662 "last_reqid": "client.4184.0:1",
3674 "data_digest": "0x2ddbf8f5",
3675 "omap_digest": "0xffffffff",
3676 "expected_object_size": 0,
3677 "expected_write_size": 0,
3678 "alloc_hint_flags": 0,
3687 "size_mismatch_info",
3688 "obj_size_info_mismatch"
3701 "selected_object_info": {
3712 "prior_version": "0'0",
3713 "last_reqid": "client.4184.0:1",
3725 "data_digest": "0x2ddbf8f5",
3726 "omap_digest": "0xffffffff",
3727 "expected_object_size": 0,
3728 "expected_write_size": 0,
3729 "alloc_hint_flags": 0,
3735 "union_shard_errors": [
3736 "size_mismatch_info",
3737 "obj_size_info_mismatch"
3775 "selected_object_info": {
3786 "prior_version": "0'0",
3787 "last_reqid": "client.4252.0:1",
3799 "data_digest": "0x2ddbf8f5",
3800 "omap_digest": "0xffffffff",
3801 "expected_object_size": 0,
3802 "expected_write_size": 0,
3803 "alloc_hint_flags": 0,
3809 "union_shard_errors": [
3828 "name": "key1-EOBJ4"
3832 "value": "val2-EOBJ4",
3833 "name": "key2-EOBJ4"
3851 "value": "val1-EOBJ4",
3852 "name": "key1-EOBJ4"
3856 "value": "val2-EOBJ4",
3857 "name": "key2-EOBJ4"
3870 "value": "val1-EOBJ4",
3871 "name": "key1-EOBJ4"
3875 "value": "val3-EOBJ4",
3876 "name": "key3-EOBJ4"
3881 "selected_object_info": {
3892 "prior_version": "45'5",
3893 "last_reqid": "client.4294.0:1",
3905 "data_digest": "0x2ddbf8f5",
3906 "omap_digest": "0xffffffff",
3907 "expected_object_size": 0,
3908 "expected_write_size": 0,
3909 "alloc_hint_flags": 0,
3915 "union_shard_errors": [],
3917 "attr_value_mismatch",
3918 "attr_name_mismatch"
3949 "prior_version": "0'0",
3950 "last_reqid": "client.4382.0:1",
3962 "data_digest": "0x2ddbf8f5",
3963 "omap_digest": "0xffffffff",
3964 "expected_object_size": 0,
3965 "expected_write_size": 0,
3966 "alloc_hint_flags": 0,
3975 "size_mismatch_info",
3976 "obj_size_info_mismatch"
3989 "selected_object_info": {
4000 "prior_version": "0'0",
4001 "last_reqid": "client.4382.0:1",
4013 "data_digest": "0x2ddbf8f5",
4014 "omap_digest": "0xffffffff",
4015 "expected_object_size": 0,
4016 "expected_write_size": 0,
4017 "alloc_hint_flags": 0,
4023 "union_shard_errors": [
4024 "size_mismatch_info",
4025 "obj_size_info_mismatch"
4047 "selected_object_info": {
4058 "prior_version": "0'0",
4059 "last_reqid": "client.4418.0:1",
4071 "data_digest": "0x2ddbf8f5",
4072 "omap_digest": "0xffffffff",
4073 "expected_object_size": 0,
4074 "expected_write_size": 0,
4075 "alloc_hint_flags": 0,
4098 "hashinfo": "bad-val",
4108 "cumulative_shard_hashes": [
4122 "total_chunk_size": 2048
4126 "union_shard_errors": [
4133 "hinfo_inconsistency"
4142 "selected_object_info": {
4153 "prior_version": "75'9",
4154 "last_reqid": "client.4482.0:1",
4166 "data_digest": "0x136e4e27",
4167 "omap_digest": "0xffffffff",
4168 "expected_object_size": 0,
4169 "expected_write_size": 0,
4170 "alloc_hint_flags": 0,
4179 "cumulative_shard_hashes": [
4193 "total_chunk_size": 2048
4203 "cumulative_shard_hashes": [
4217 "total_chunk_size": 2048
4227 "cumulative_shard_hashes": [
4241 "total_chunk_size": 2048
4250 "union_shard_errors": []
4257 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python
-c "$sortkeys" > $dir/csjson
4258 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
4259 if test $getjson = "yes"
4261 jq
'.' $dir/json
> save3.json
4264 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
4266 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
4271 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
4273 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
4275 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
4277 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
4278 # Get epoch for repair-get requests
4279 epoch
=$
(jq .epoch
$dir/json
)
4281 if [ "$allow_overwrites" = "true" ]
4283 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
4289 "data_digest": "0x00000000",
4290 "omap_digest": "0xffffffff",
4309 "prior_version": "0'0",
4310 "last_reqid": "client.4184.0:1",
4313 "mtime": "2018-04-05 14:31:33.837147",
4314 "local_mtime": "2018-04-05 14:31:33.840763",
4322 "data_digest": "0x2ddbf8f5",
4323 "omap_digest": "0xffffffff",
4324 "expected_object_size": 0,
4325 "expected_write_size": 0,
4326 "alloc_hint_flags": 0,
4336 "size_mismatch_info",
4337 "obj_size_info_mismatch"
4343 "data_digest": "0x00000000",
4344 "omap_digest": "0xffffffff",
4352 "selected_object_info": {
4363 "prior_version": "0'0",
4364 "last_reqid": "client.4184.0:1",
4367 "mtime": "2018-04-05 14:31:33.837147",
4368 "local_mtime": "2018-04-05 14:31:33.840763",
4376 "data_digest": "0x2ddbf8f5",
4377 "omap_digest": "0xffffffff",
4378 "expected_object_size": 0,
4379 "expected_write_size": 0,
4380 "alloc_hint_flags": 0,
4386 "union_shard_errors": [
4388 "size_mismatch_info",
4389 "obj_size_info_mismatch"
4405 "data_digest": "0x00000000",
4406 "omap_digest": "0xffffffff",
4422 "data_digest": "0x00000000",
4423 "omap_digest": "0xffffffff",
4431 "selected_object_info": {
4442 "prior_version": "0'0",
4443 "last_reqid": "client.4252.0:1",
4446 "mtime": "2018-04-05 14:31:46.841145",
4447 "local_mtime": "2018-04-05 14:31:46.844996",
4455 "data_digest": "0x2ddbf8f5",
4456 "omap_digest": "0xffffffff",
4457 "expected_object_size": 0,
4458 "expected_write_size": 0,
4459 "alloc_hint_flags": 0,
4465 "union_shard_errors": [
4484 "name": "key1-EOBJ4"
4488 "value": "val2-EOBJ4",
4489 "name": "key2-EOBJ4"
4492 "data_digest": "0x00000000",
4493 "omap_digest": "0xffffffff",
4504 "value": "val1-EOBJ4",
4505 "name": "key1-EOBJ4"
4509 "value": "val2-EOBJ4",
4510 "name": "key2-EOBJ4"
4513 "data_digest": "0x00000000",
4514 "omap_digest": "0xffffffff",
4525 "value": "val1-EOBJ4",
4526 "name": "key1-EOBJ4"
4530 "value": "val3-EOBJ4",
4531 "name": "key3-EOBJ4"
4534 "data_digest": "0x00000000",
4535 "omap_digest": "0xffffffff",
4543 "selected_object_info": {
4554 "prior_version": "45'5",
4555 "last_reqid": "client.4294.0:1",
4558 "mtime": "2018-04-05 14:31:54.663622",
4559 "local_mtime": "2018-04-05 14:31:54.664527",
4567 "data_digest": "0x2ddbf8f5",
4568 "omap_digest": "0xffffffff",
4569 "expected_object_size": 0,
4570 "expected_write_size": 0,
4571 "alloc_hint_flags": 0,
4577 "union_shard_errors": [],
4579 "attr_value_mismatch",
4580 "attr_name_mismatch"
4593 "data_digest": "0x00000000",
4594 "omap_digest": "0xffffffff",
4602 "data_digest": "0x00000000",
4603 "omap_digest": "0xffffffff",
4615 "prior_version": "0'0",
4616 "last_reqid": "client.4382.0:1",
4619 "mtime": "2018-04-05 14:32:12.929161",
4620 "local_mtime": "2018-04-05 14:32:12.934707",
4628 "data_digest": "0x2ddbf8f5",
4629 "omap_digest": "0xffffffff",
4630 "expected_object_size": 0,
4631 "expected_write_size": 0,
4632 "alloc_hint_flags": 0,
4640 "size_mismatch_info",
4641 "obj_size_info_mismatch"
4648 "data_digest": "0x00000000",
4649 "omap_digest": "0xffffffff",
4657 "selected_object_info": {
4668 "prior_version": "0'0",
4669 "last_reqid": "client.4382.0:1",
4672 "mtime": "2018-04-05 14:32:12.929161",
4673 "local_mtime": "2018-04-05 14:32:12.934707",
4681 "data_digest": "0x2ddbf8f5",
4682 "omap_digest": "0xffffffff",
4683 "expected_object_size": 0,
4684 "expected_write_size": 0,
4685 "alloc_hint_flags": 0,
4691 "union_shard_errors": [
4692 "size_mismatch_info",
4693 "obj_size_info_mismatch"
4715 "union_shard_errors": [
4720 "selected_object_info": {
4731 "prior_version": "0'0",
4732 "last_reqid": "client.4418.0:1",
4735 "mtime": "2018-04-05 14:32:20.634116",
4736 "local_mtime": "2018-04-05 14:32:20.637999",
4744 "data_digest": "0x2ddbf8f5",
4745 "omap_digest": "0xffffffff",
4746 "expected_object_size": 0,
4747 "expected_write_size": 0,
4748 "alloc_hint_flags": 0,
4774 "hashinfo": "bad-val"
4782 "omap_digest": "0xffffffff",
4783 "data_digest": "0x00000000",
4785 "cumulative_shard_hashes": [
4799 "total_chunk_size": 2048
4813 "hinfo_inconsistency"
4815 "union_shard_errors": [],
4816 "selected_object_info": {
4827 "prior_version": "75'9",
4828 "last_reqid": "client.4482.0:1",
4831 "mtime": "2018-04-05 14:32:33.058782",
4832 "local_mtime": "2018-04-05 14:32:33.059679",
4840 "data_digest": "0x136e4e27",
4841 "omap_digest": "0xffffffff",
4842 "expected_object_size": 0,
4843 "expected_write_size": 0,
4844 "alloc_hint_flags": 0,
4857 "omap_digest": "0xffffffff",
4858 "data_digest": "0x00000000",
4860 "cumulative_shard_hashes": [
4874 "total_chunk_size": 2048
4883 "omap_digest": "0xffffffff",
4884 "data_digest": "0x00000000",
4886 "cumulative_shard_hashes": [
4900 "total_chunk_size": 2048
4909 "omap_digest": "0xffffffff",
4910 "data_digest": "0x00000000",
4912 "cumulative_shard_hashes": [
4926 "total_chunk_size": 2048
4938 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
4944 "data_digest": "0x04cfa72f",
4945 "omap_digest": "0xffffffff",
4964 "prior_version": "0'0",
4965 "last_reqid": "client.4192.0:1",
4968 "mtime": "2018-04-05 14:30:10.688009",
4969 "local_mtime": "2018-04-05 14:30:10.691774",
4977 "data_digest": "0x2ddbf8f5",
4978 "omap_digest": "0xffffffff",
4979 "expected_object_size": 0,
4980 "expected_write_size": 0,
4981 "alloc_hint_flags": 0,
4991 "size_mismatch_info",
4992 "obj_size_info_mismatch"
4998 "data_digest": "0x04cfa72f",
4999 "omap_digest": "0xffffffff",
5007 "selected_object_info": {
5018 "prior_version": "0'0",
5019 "last_reqid": "client.4192.0:1",
5022 "mtime": "2018-04-05 14:30:10.688009",
5023 "local_mtime": "2018-04-05 14:30:10.691774",
5031 "data_digest": "0x2ddbf8f5",
5032 "omap_digest": "0xffffffff",
5033 "expected_object_size": 0,
5034 "expected_write_size": 0,
5035 "alloc_hint_flags": 0,
5041 "union_shard_errors": [
5043 "size_mismatch_info",
5044 "obj_size_info_mismatch"
5069 "data_digest": "0x04cfa72f",
5070 "omap_digest": "0xffffffff",
5078 "data_digest": "0x04cfa72f",
5079 "omap_digest": "0xffffffff",
5087 "selected_object_info": {
5098 "prior_version": "0'0",
5099 "last_reqid": "client.4224.0:1",
5102 "mtime": "2018-04-05 14:30:14.152945",
5103 "local_mtime": "2018-04-05 14:30:14.154014",
5111 "data_digest": "0x2ddbf8f5",
5112 "omap_digest": "0xffffffff",
5113 "expected_object_size": 0,
5114 "expected_write_size": 0,
5115 "alloc_hint_flags": 0,
5121 "union_shard_errors": [
5136 "data_digest": "0x04cfa72f",
5137 "omap_digest": "0xffffffff",
5153 "data_digest": "0x04cfa72f",
5154 "omap_digest": "0xffffffff",
5162 "selected_object_info": {
5173 "prior_version": "0'0",
5174 "last_reqid": "client.4258.0:1",
5177 "mtime": "2018-04-05 14:30:18.875544",
5178 "local_mtime": "2018-04-05 14:30:18.880153",
5186 "data_digest": "0x2ddbf8f5",
5187 "omap_digest": "0xffffffff",
5188 "expected_object_size": 0,
5189 "expected_write_size": 0,
5190 "alloc_hint_flags": 0,
5196 "union_shard_errors": [
5215 "name": "key1-EOBJ4"
5219 "value": "val2-EOBJ4",
5220 "name": "key2-EOBJ4"
5223 "data_digest": "0x04cfa72f",
5224 "omap_digest": "0xffffffff",
5237 "omap_digest": "0xffffffff",
5238 "data_digest": "0x04cfa72f",
5242 "value": "val1-EOBJ4",
5243 "name": "key1-EOBJ4"
5247 "value": "val2-EOBJ4",
5248 "name": "key2-EOBJ4"
5258 "omap_digest": "0xffffffff",
5259 "data_digest": "0x04cfa72f",
5263 "value": "val1-EOBJ4",
5264 "name": "key1-EOBJ4"
5268 "value": "val3-EOBJ4",
5269 "name": "key3-EOBJ4"
5274 "selected_object_info": {
5285 "prior_version": "45'5",
5286 "last_reqid": "client.4296.0:1",
5289 "mtime": "2018-04-05 14:30:22.271983",
5290 "local_mtime": "2018-04-05 14:30:22.272840",
5298 "data_digest": "0x2ddbf8f5",
5299 "omap_digest": "0xffffffff",
5300 "expected_object_size": 0,
5301 "expected_write_size": 0,
5302 "alloc_hint_flags": 0,
5308 "union_shard_errors": [],
5310 "attr_value_mismatch",
5311 "attr_name_mismatch"
5324 "data_digest": "0x04cfa72f",
5325 "omap_digest": "0xffffffff",
5344 "prior_version": "0'0",
5345 "last_reqid": "client.4384.0:1",
5348 "mtime": "2018-04-05 14:30:35.162395",
5349 "local_mtime": "2018-04-05 14:30:35.166390",
5357 "data_digest": "0x2ddbf8f5",
5358 "omap_digest": "0xffffffff",
5359 "expected_object_size": 0,
5360 "expected_write_size": 0,
5361 "alloc_hint_flags": 0,
5370 "size_mismatch_info",
5372 "obj_size_info_mismatch"
5378 "data_digest": "0x04cfa72f",
5379 "omap_digest": "0xffffffff",
5387 "selected_object_info": {
5398 "prior_version": "0'0",
5399 "last_reqid": "client.4384.0:1",
5402 "mtime": "2018-04-05 14:30:35.162395",
5403 "local_mtime": "2018-04-05 14:30:35.166390",
5411 "data_digest": "0x2ddbf8f5",
5412 "omap_digest": "0xffffffff",
5413 "expected_object_size": 0,
5414 "expected_write_size": 0,
5415 "alloc_hint_flags": 0,
5421 "union_shard_errors": [
5422 "size_mismatch_info",
5424 "obj_size_info_mismatch"
5446 "union_shard_errors": [
5451 "selected_object_info": {
5462 "prior_version": "0'0",
5463 "last_reqid": "client.4420.0:1",
5466 "mtime": "2018-04-05 14:30:40.914673",
5467 "local_mtime": "2018-04-05 14:30:40.917705",
5475 "data_digest": "0x2ddbf8f5",
5476 "omap_digest": "0xffffffff",
5477 "expected_object_size": 0,
5478 "expected_write_size": 0,
5479 "alloc_hint_flags": 0,
5505 "hashinfo": "bad-val"
5513 "omap_digest": "0xffffffff",
5514 "data_digest": "0x04cfa72f",
5516 "cumulative_shard_hashes": [
5530 "total_chunk_size": 2048
5544 "hinfo_inconsistency"
5546 "union_shard_errors": [
5549 "selected_object_info": {
5560 "prior_version": "75'9",
5561 "last_reqid": "client.4486.0:1",
5564 "mtime": "2018-04-05 14:30:50.995009",
5565 "local_mtime": "2018-04-05 14:30:50.996112",
5573 "data_digest": "0x136e4e27",
5574 "omap_digest": "0xffffffff",
5575 "expected_object_size": 0,
5576 "expected_write_size": 0,
5577 "alloc_hint_flags": 0,
5593 "cumulative_shard_hashes": [
5607 "total_chunk_size": 2048
5616 "omap_digest": "0xffffffff",
5617 "data_digest": "0x5b7455a8",
5619 "cumulative_shard_hashes": [
5633 "total_chunk_size": 2048
5642 "omap_digest": "0xffffffff",
5643 "data_digest": "0x5b7455a8",
5645 "cumulative_shard_hashes": [
5659 "total_chunk_size": 2048
5671 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python
-c "$sortkeys" > $dir/csjson
5672 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
5673 if test $getjson = "yes"
5675 if [ "$allow_overwrites" = "true" ]
5681 jq
'.' $dir/json
> save
${num}.json
5684 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
5686 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
5689 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
5690 teardown
$dir ||
return 1
# TEST_corrupt_scrub_erasure_appends: run the EC scrub-error test in
# append-only mode (no EC overwrites, i.e. the filestore-compatible path).
#
# Arguments: $1 - test directory, forwarded to corrupt_scrub_erasure.
# Returns:   status of corrupt_scrub_erasure.
#
# Fixes vs. original: reflowed the mangled line-wrapped text back into valid
# shell, and quoted "$1" so a test directory containing whitespace is passed
# as a single argument.
function TEST_corrupt_scrub_erasure_appends() {
    corrupt_scrub_erasure "$1" false
}
# TEST_corrupt_scrub_erasure_overwrites: run the EC scrub-error test with
# EC overwrites enabled. Skipped (no-op, returns 0) unless the file-level
# $use_ec_overwrite flag is "true" — on FreeBSD it is forced to false because
# EC overwrites are only tested on Bluestore (see the header of this file).
#
# Arguments: $1 - test directory, forwarded to corrupt_scrub_erasure.
# Returns:   status of corrupt_scrub_erasure, or 0 when skipped.
#
# Fixes vs. original: reflowed the mangled line-wrapped text back into valid
# shell (restoring the dropped 'fi' and closing brace), and quoted "$1".
function TEST_corrupt_scrub_erasure_overwrites() {
    if [ "$use_ec_overwrite" = "true" ]; then
        corrupt_scrub_erasure "$1" true
    fi
}
5704 # Test to make sure that a periodic scrub won't cause deep-scrub info to be lost
# TEST_periodic_scrub_replicated: verify a periodic (regular) scrub does not
# lose deep-scrub error information. Visible flow: size-2 replicated pool,
# corrupt one replica's bytes via objectstore_tool (a deep-scrub-only error),
# deep-scrub and confirm the object is listed inconsistent, then trigger fake
# scheduled scrubs via the OSD admin socket and grep the OSD log for the
# upgrade/skip/lost messages. Presumably $1 is the test dir — the
# 'local dir=$1' line is absent from this extraction; TODO confirm.
# NOTE(review): lossy line-wrapped extraction — logical lines are split and
# some original lines (e.g. sleeps, loop do/done) are missing; text preserved
# byte-for-byte below.
5706 function TEST_periodic_scrub_replicated
() {
5708 local poolname
=psr_pool
5711 setup
$dir ||
return 1
5712 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
5713 run_mgr
$dir x ||
return 1
5714 local ceph_osd_args
="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
5715 ceph_osd_args
+="--osd_scrub_backoff_ratio=0"
5716 run_osd
$dir 0 $ceph_osd_args ||
return 1
5717 run_osd
$dir 1 $ceph_osd_args ||
return 1
5718 create_rbd_pool ||
return 1
5719 wait_for_clean ||
return 1
5721 create_pool
$poolname 1 1 ||
return 1
5722 wait_for_clean ||
return 1
5725 add_something
$dir $poolname $objname scrub ||
return 1
5726 local primary
=$
(get_primary
$poolname $objname)
5727 local pg
=$
(get_pg
$poolname $objname)
5729 # Add deep-scrub only error
5730 local payload
=UVWXYZ
5731 echo $payload > $dir/CORRUPT
5732 # Uses $ceph_osd_args for osd restart
5733 objectstore_tool
$dir $osd $objname set-bytes
$dir/CORRUPT ||
return 1
5735 # No scrub information available, so expect failure
5737 ! rados list-inconsistent-obj
$pg | jq
'.' ||
return 1
5740 pg_deep_scrub
$pg ||
return 1
5742 # Make sure bad object found
5743 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5746 local last_scrub
=$
(get_last_scrub_stamp
$pg)
5747 # Fake a schedule scrub
5748 CEPH_ARGS
='' ceph
--admin-daemon $
(get_asok_path osd.
${primary}) \
5749 trigger_scrub
$pg ||
return 1
5750 # Wait for schedule regular scrub
5751 wait_for_scrub
$pg "$last_scrub"
5753 # It needed to be upgraded
5754 grep -q "Deep scrub errors, upgrading scrub to deep-scrub" $dir/osd.
${primary}.log ||
return 1
5756 # Bad object still known
5757 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5759 # Can't upgrade with this set
5760 ceph osd
set nodeep-scrub
5761 # Let map change propagate to OSDs
5765 # Fake a schedule scrub
5766 CEPH_ARGS
='' ceph
--admin-daemon $
(get_asok_path osd.
${primary}) \
5767 trigger_scrub
$pg ||
return 1
5768 # Wait for schedule regular scrub
5769 # to notice scrub and skip it
5771 for i
in $
(seq 14 -1 0)
5774 ! grep -q "Regular scrub skipped due to deep-scrub errors and nodeep-scrub set" $dir/osd.
${primary}.log ||
{ found
=true
; break; }
5775 echo Time left
: $i seconds
5777 test $found = "true" ||
return 1
5779 # Bad object still known
5780 rados list-inconsistent-obj
$pg | jq
'.' |
grep -q $objname ||
return 1
5783 # Request a regular scrub and it will be done
5785 grep -q "Regular scrub request, deep-scrub details will be lost" $dir/osd.
${primary}.log ||
return 1
5787 # deep-scrub error is no longer present
5788 rados list-inconsistent-obj
$pg | jq
'.' |
grep -qv $objname ||
return 1
# TEST_scrub_warning: verify the mon health warnings for PGs not (deep-)
# scrubbed in time. Visible flow: compute overdue thresholds with calc from
# per-pool (14-day) and global-config (7-day) intervals, create one pool per
# scheduled scrub/deep-scrub, disable real scrubbing (noscrub/nodeep-scrub,
# zero randomize ratios), fake overdue (deep-)scrub stamps via the OSD admin
# socket, then grep 'ceph health' / 'ceph health detail' for the expected
# counts. NOTE(review): lossy extraction — the declarations of $scrubs,
# $deep_scrubs, $i1_day, $overdue, $dir and several do/done/fi lines are
# missing from this text (numbering gaps); preserved byte-for-byte below.
5791 function TEST_scrub_warning
() {
5793 local poolname
=psr_pool
5798 local i7_days
=$
(calc
$i1_day \
* 7)
5799 local i14_days
=$
(calc
$i1_day \
* 14)
5801 local conf_overdue_seconds
=$
(calc
$i7_days + $i1_day + \
( $i7_days \
* $overdue \
) )
5802 local pool_overdue_seconds
=$
(calc
$i14_days + $i1_day + \
( $i14_days \
* $overdue \
) )
5804 setup
$dir ||
return 1
5805 run_mon
$dir a
--osd_pool_default_size=1 ||
return 1
5806 run_mgr
$dir x
--mon_warn_pg_not_scrubbed_ratio=${overdue} --mon_warn_pg_not_deep_scrubbed_ratio=${overdue} ||
return 1
5807 run_osd
$dir 0 $ceph_osd_args --osd_scrub_backoff_ratio=0 ||
return 1
5809 for i
in $
(seq 1 $
(expr $scrubs + $deep_scrubs))
5811 create_pool
$poolname-$i 1 1 ||
return 1
5812 wait_for_clean ||
return 1
5815 ceph osd pool
set $poolname-$i scrub_max_interval
$i14_days
5817 if [ $i = $
(expr $scrubs + 1) ];
5819 ceph osd pool
set $poolname-$i deep_scrub_interval
$i14_days
5826 ceph osd
set noscrub ||
return 1
5827 ceph osd
set nodeep-scrub ||
return 1
5828 ceph config
set global osd_scrub_interval_randomize_ratio
0
5829 ceph config
set global osd_deep_scrub_randomize_ratio
0
5830 ceph config
set global osd_scrub_max_interval
${i7_days}
5831 ceph config
set global osd_deep_scrub_interval
${i7_days}
5833 # Fake schedule scrubs
5834 for i
in $
(seq 1 $scrubs)
5838 overdue_seconds
=$pool_overdue_seconds
5840 overdue_seconds
=$conf_overdue_seconds
5842 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
${primary}) \
5843 trigger_scrub
${i}.0 $(expr ${overdue_seconds} + ${i}00) ||
return 1
5845 # Fake schedule deep scrubs
5846 for i
in $
(seq $
(expr $scrubs + 1) $
(expr $scrubs + $deep_scrubs))
5848 if [ $i = "$(expr $scrubs + 1)" ];
5850 overdue_seconds
=$pool_overdue_seconds
5852 overdue_seconds
=$conf_overdue_seconds
5854 CEPH_ARGS
='' ceph daemon $
(get_asok_path osd.
${primary}) \
5855 trigger_deep_scrub
${i}.0 $(expr ${overdue_seconds} + ${i}00) ||
return 1
5861 ceph health |
grep -q "$deep_scrubs pgs not deep-scrubbed in time" ||
return 1
5862 ceph health |
grep -q "$scrubs pgs not scrubbed in time" ||
return 1
5863 COUNT
=$
(ceph health detail |
grep "not scrubbed since" |
wc -l)
5864 if [ "$COUNT" != $scrubs ]; then
5865 ceph health detail |
grep "not scrubbed since"
5868 COUNT
=$
(ceph health detail |
grep "not deep-scrubbed since" |
wc -l)
5869 if [ "$COUNT" != $deep_scrubs ]; then
5870 ceph health detail |
grep "not deep-scrubbed since"
5877 # Corrupt snapset in replicated pool
# TEST_corrupt_snapset_scrub_rep: verify scrub detects a corrupted SnapSet in
# a size-2 replicated pool. Visible flow: create objects with omap header and
# key/value, take snapshot snap1, overwrite object heads and corrupt the
# SnapSet on alternating OSDs via 'objectstore_tool ... clear-snapset
# corrupt', then compare 'rados list-inconsistent-obj' (through $jqfilter /
# $sortkeys) against the expected JSON heredoc and grep the primary's log for
# the expected scrub ERR lines, accumulating failures in $ERRORS.
# NOTE(review): lossy line-wrapped extraction — logical lines are split and
# much of the heredoc JSON plus several structural lines are missing
# (numbering gaps); preserved byte-for-byte below.
5879 function TEST_corrupt_snapset_scrub_rep
() {
5881 local poolname
=csr_pool
5884 setup
$dir ||
return 1
5885 run_mon
$dir a
--osd_pool_default_size=2 ||
return 1
5886 run_mgr
$dir x ||
return 1
5887 run_osd
$dir 0 ||
return 1
5888 run_osd
$dir 1 ||
return 1
5889 create_rbd_pool ||
return 1
5890 wait_for_clean ||
return 1
5892 create_pool foo
1 ||
return 1
5893 create_pool
$poolname 1 1 ||
return 1
5894 wait_for_clean ||
return 1
5896 for i
in $
(seq 1 $total_objs) ; do
5898 add_something
$dir $poolname $objname ||
return 1
5900 rados
--pool $poolname setomapheader
$objname hdr-
$objname ||
return 1
5901 rados
--pool $poolname setomapval
$objname key-
$objname val-
$objname ||
return 1
5904 local pg
=$
(get_pg
$poolname ROBJ0
)
5905 local primary
=$
(get_primary
$poolname ROBJ0
)
5907 rados
-p $poolname mksnap snap1
5908 echo -n head_of_snapshot_data
> $dir/change
5910 for i
in $
(seq 1 $total_objs) ; do
5913 # Alternate corruption between osd.0 and osd.1
5914 local osd
=$
(expr $i % 2)
5918 rados
--pool $poolname put
$objname $dir/change
5919 objectstore_tool
$dir $osd --head $objname clear-snapset corrupt ||
return 1
5923 rados
--pool $poolname put
$objname $dir/change
5924 objectstore_tool
$dir $osd --head $objname clear-snapset corrupt ||
return 1
5933 rados list-inconsistent-pg
$poolname > $dir/json ||
return 1
5935 test $
(jq
'. | length' $dir/json
) = "1" ||
return 1
5937 test $
(jq
-r '.[0]' $dir/json
) = $pg ||
return 1
5939 rados list-inconsistent-obj
$pg > $dir/json ||
return 1
5941 jq
"$jqfilter" << EOF | jq '.inconsistents' | python -c "$sortkeys" > $dir/checkcsjson
5954 "snapset_inconsistency"
5956 "union_shard_errors": [],
5957 "selected_object_info": {
5968 "prior_version": "21'3",
5969 "last_reqid": "client.4195.0:1",
5972 "mtime": "2018-04-05 14:35:43.286117",
5973 "local_mtime": "2018-04-05 14:35:43.288990",
5982 "data_digest": "0x53acb008",
5983 "omap_digest": "0xffffffff",
5984 "expected_object_size": 0,
5985 "expected_write_size": 0,
5986 "alloc_hint_flags": 0,
6041 "snapset_inconsistency"
6043 "union_shard_errors": [],
6044 "selected_object_info": {
6055 "prior_version": "23'6",
6056 "last_reqid": "client.4223.0:1",
6059 "mtime": "2018-04-05 14:35:48.326856",
6060 "local_mtime": "2018-04-05 14:35:48.328097",
6069 "data_digest": "0x53acb008",
6070 "omap_digest": "0xffffffff",
6071 "expected_object_size": 0,
6072 "expected_write_size": 0,
6073 "alloc_hint_flags": 0,
6123 jq
"$jqfilter" $dir/json | jq
'.inconsistents' | python
-c "$sortkeys" > $dir/csjson
6124 multidiff
$dir/checkcsjson
$dir/csjson ||
test $getjson = "yes" ||
return 1
6125 if test $getjson = "yes"
6127 jq
'.' $dir/json
> save6.json
6130 if test "$LOCALRUN" = "yes" && which jsonschema
> /dev
/null
;
6132 jsonschema
-i $dir/json
$CEPH_ROOT/doc
/rados
/command
/list-inconsistent-obj.json ||
return 1
6136 declare -a err_strings
6137 err_strings
[0]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid [0-9]*:.*:::ROBJ1:head : snapset inconsistent"
6138 err_strings
[1]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 soid [0-9]*:.*:::ROBJ2:head : snapset inconsistent"
6139 err_strings
[2]="log_channel[(]cluster[)] log [[]ERR[]] : scrub [0-9]*[.]0 [0-9]*:.*:::ROBJ1:1 : is an unexpected clone"
6140 err_strings
[3]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub : stat mismatch, got 3/4 objects, 1/2 clones, 3/4 dirty, 3/4 omap, 0/0 pinned, 0/0 hit_set_archive, 0/0 whiteouts, 49/56 bytes, 0/0 manifest objects, 0/0 hit_set_archive bytes."
6141 err_strings
[4]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 0 missing, 2 inconsistent objects"
6142 err_strings
[5]="log_channel[(]cluster[)] log [[]ERR[]] : [0-9]*[.]0 scrub 4 errors"
6144 for err_string
in "${err_strings[@]}"
6146 if ! grep -q "$err_string" $dir/osd.
${primary}.log
6148 echo "Missing log message '$err_string'"
6149 ERRORS
=$
(expr $ERRORS + 1)
6153 if [ $ERRORS != "0" ];
6155 echo "TEST FAILED WITH $ERRORS ERRORS"
6159 ceph osd pool
rm $poolname $poolname --yes-i-really-really-mean-it
6160 teardown
$dir ||
return 1
# TEST_request_scrub_priority: verify an operator-requested scrub runs before
# already-queued periodic scrubs. Visible flow: single-OSD pool with $PGS
# placement groups, fake-schedule scrubs on every PG except the target $pg
# via the admin socket, then unset noscrub/nodeep-scrub, wait for all scrubs,
# and confirm (first 'scrub ok' line in the primary's log) that the requested
# PG scrubbed first. NOTE(review): lossy line-wrapped extraction — the
# declarations of $dir/$OBJS/$PGS, the explicit scrub request, and several
# do/done/fi lines are missing (numbering gaps 6166-6169, 6198-6202, ...);
# preserved byte-for-byte below.
6163 function TEST_request_scrub_priority
() {
6165 local poolname
=psr_pool
6170 setup
$dir ||
return 1
6171 run_mon
$dir a
--osd_pool_default_size=1 ||
return 1
6172 run_mgr
$dir x ||
return 1
6173 local ceph_osd_args
="--osd-scrub-interval-randomize-ratio=0 --osd-deep-scrub-randomize-ratio=0 "
6174 ceph_osd_args
+="--osd_scrub_backoff_ratio=0"
6175 run_osd
$dir 0 $ceph_osd_args ||
return 1
6177 create_pool
$poolname $PGS $PGS ||
return 1
6178 wait_for_clean ||
return 1
6181 add_something
$dir $poolname $objname noscrub ||
return 1
6182 local primary
=$
(get_primary
$poolname $objname)
6183 local pg
=$
(get_pg
$poolname $objname)
6184 poolid
=$
(ceph osd dump |
grep "^pool.*[']${poolname}[']" |
awk '{ print $2 }')
6187 for i
in $
(seq 0 $
(expr $PGS - 1))
6189 opg
="${poolid}.${i}"
6190 if [ "$opg" = "$pg" ]; then
6193 otherpgs
="${otherpgs}${opg} "
6194 local other_last_scrub
=$
(get_last_scrub_stamp
$pg)
6195 # Fake a schedule scrub
6196 CEPH_ARGS
='' ceph
--admin-daemon $
(get_asok_path osd.
${primary}) \
6197 trigger_scrub
$opg ||
return 1
6203 # Request a regular scrub and it will be done
6204 local last_scrub
=$
(get_last_scrub_stamp
$pg)
6207 ceph osd
unset noscrub ||
return 1
6208 ceph osd
unset nodeep-scrub ||
return 1
6210 wait_for_scrub
$pg "$last_scrub"
6212 for opg
in $otherpgs $pg
6214 wait_for_scrub
$opg "$other_last_scrub"
6217 # Verify that the requested scrub ran first
6218 grep "log_channel.*scrub ok" $dir/osd.
${primary}.log |
head -1 |
sed 's/.*[[]DBG[]]//' |
grep -q $pg ||
return 1
# Entry point: invoke the standalone-test harness (main is presumably defined
# in the sourced $CEPH_ROOT/qa/standalone/ceph-helpers.sh — confirm) with this
# suite's name; "$@" forwards any TEST_* name filters from the command line.
6224 main osd-scrub-repair
"$@"
6227 # compile-command: "cd build ; make -j4 && \
6228 # ../qa/run-standalone.sh osd-scrub-repair.sh"