# ceph/qa/standalone/erasure-code/test-erasure-eio.sh (from ceph v12.2.3)
#!/bin/bash
#
# Copyright (C) 2015 Red Hat <contact@redhat.com>
#
#
# Author: Kefu Chai <kchai@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
# Test harness entry point: run each TEST_* function (or the functions
# named on the command line) against a fresh mon+mgr cluster in $dir.
function run() {
    local dir=$1
    shift

    # Port must be unique across the standalone suite:
    # git grep '\<7112\>' : there must be only one
    export CEPH_MON="127.0.0.1:7112"
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    # Default to every TEST_* function currently defined; deliberately
    # left unquoted below so the list word-splits into function names.
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local func
    for func in $funcs ; do
        setup "$dir" || return 1
        run_mon "$dir" a || return 1
        run_mgr "$dir" x || return 1
        create_rbd_pool || return 1

        # check that erasure code plugins are preloaded
        CEPH_ARGS='' ceph --admin-daemon "$(get_asok_path mon.a)" log flush || return 1
        grep 'load: jerasure.*lrc' "$dir/mon.a.log" || return 1
        "$func" "$dir" || return 1
        teardown "$dir" || return 1
    done
}
44
# Start $1 OSDs (ids 0..count-1) and wait for the cluster to go clean.
# NOTE(review): relies on the caller's 'local dir' being visible here via
# bash dynamic scoping — every caller in this file defines it; confirm if
# reused elsewhere.
function setup_osds() {
    local count=$1
    shift

    local id
    for id in $(seq 0 $((count - 1))) ; do
        run_osd "$dir" "$id" || return 1
    done
    wait_for_clean || return 1

    # check that erasure code plugins are preloaded
    CEPH_ARGS='' ceph --admin-daemon "$(get_asok_path osd.0)" log flush || return 1
    grep 'load: jerasure.*lrc' "$dir/osd.0.log" || return 1
}
58
# Print the "state" field of PG $1 as reported by "ceph pg dump pgs".
function get_state() {
    local pgid=$1
    ceph --format json pg dump pgs 2>/dev/null |
        jq -r --arg id "$pgid" '.[] | select(.pgid == $id) | .state'
}
65
# Create a one-PG erasure coded pool $1 backed by a jerasure profile
# with k=$2 data and m=$3 coding shards, failure domain = osd.
function create_erasure_coded_pool() {
    local poolname=$1
    local k=$2
    local m=$3

    ceph osd erasure-code-profile set myprofile \
        plugin=jerasure \
        k=$k m=$m \
        crush-failure-domain=osd || return 1
    create_pool "$poolname" 1 1 erasure myprofile || return 1
    wait_for_clean || return 1
}
82
# Remove pool $1 and the "myprofile" EC profile that
# create_erasure_coded_pool() registered for it.
function delete_pool() {
    local poolname=$1

    ceph osd pool delete "$poolname" "$poolname" --yes-i-really-really-mean-it
    ceph osd erasure-code-profile rm myprofile
}
89
# Write a 4 KiB reference payload to $dir/ORIGINAL and store it in pool
# $2 under object name $3 (default SOMETHING). Later reads are compared
# against $dir/ORIGINAL.
function rados_put() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local marker

    # Four markers, each left-padded to 1024 bytes -> 4096 bytes total.
    for marker in AAA BBB CCCC DDDD ; do
        printf '%*s' 1024 "$marker"
    done > "$dir/ORIGINAL"

    rados --pool "$poolname" put "$objname" "$dir/ORIGINAL" || return 1
}
103
# Fetch object $3 (default SOMETHING) from pool $2 into $dir/COPY and
# compare it against $dir/ORIGINAL. With $4 == "fail", instead assert
# that the get fails.
function rados_get() {
    local dir=$1
    local poolname=$2
    local objname=${3:-SOMETHING}
    local expect=${4:-ok}

    #
    # Expect a failure to get object
    #
    if [ "$expect" = "fail" ];
    then
        ! rados --pool "$poolname" get "$objname" "$dir/COPY"
        return
    fi
    #
    # get an object, compare with $dir/ORIGINAL
    #
    rados --pool "$poolname" get "$objname" "$dir/COPY" || return 1
    diff "$dir/ORIGINAL" "$dir/COPY" || return 1
    rm "$dir/COPY"
}
125
126
# Remove an object shard from the OSD backing it, via objectstore_tool.
# Signature matches the inject_* helpers dispatched by rados_put_get_data:
#   inject_remove <pooltype> <which> <poolname> <objname> <dir> <shard_id>
# <pooltype> and <which> are accepted for interface compatibility only.
function inject_remove() {
    local poolname=$3
    local objname=$4
    local dir=$5
    local shard_id=$6

    # Word-splitting of get_osds output into the acting set is intentional.
    local -a acting=($(get_osds "$poolname" "$objname"))
    local victim=${acting[$shard_id]}
    objectstore_tool "$dir" "$victim" "$objname" remove || return 1
}
145
# Write an object, inject error $1 (eio or remove -> inject_$1) into shard
# $3, and verify the object is still readable; then break shard $3+1 as
# well and verify the read fails (2 of 3 shards bad). With $4 == "recovery",
# also cycle the last OSD in the acting set to exercise recovery in between.
function rados_put_get_data() {
    local inject=$1
    shift
    local dir=$1
    shift
    local shard_id=$1
    shift
    local arg=$1

    # inject error into the specified shard
    #
    local poolname=pool-jerasure
    local objname=obj-$inject-$$-$shard_id
    rados_put "$dir" "$poolname" "$objname" || return 1
    inject_$inject ec data "$poolname" "$objname" "$dir" "$shard_id" || return 1
    # One bad shard: the client read must still succeed.
    rados_get "$dir" "$poolname" "$objname" || return 1

    if [ "$arg" = "recovery" ];
    then
        #
        # take out the last OSD used to store the object,
        # bring it back, and check for clean PGs which means
        # recovery didn't crash the primary.
        #
        local -a initial_osds=($(get_osds "$poolname" "$objname"))
        local last_osd=${initial_osds[-1]}
        # Kill OSD
        kill_daemons "$dir" TERM osd.${last_osd} >&2 < /dev/null || return 1
        ceph osd out ${last_osd} || return 1
        ! get_osds "$poolname" "$objname" | grep '\<'${last_osd}'\>' || return 1
        ceph osd in ${last_osd} || return 1
        run_osd "$dir" ${last_osd} || return 1
        wait_for_clean || return 1
    fi

    shard_id=$((shard_id + 1))
    inject_$inject ec data "$poolname" "$objname" "$dir" "$shard_id" || return 1
    # Now 2 out of 3 shards get an error, so should fail
    rados_get "$dir" "$poolname" "$objname" fail || return 1
    rm "$dir/ORIGINAL"
}
188
# Change the size of the specified shard's on-disk payload.
#   set_size <objname> <dir> <shard_id> <bytes> [mode]
# mode "add" appends $bytes random bytes to the shard; otherwise the shard
# is replaced by $bytes random bytes (0 -> truncated to empty).
function set_size() {
    local objname=$1
    shift
    local dir=$1
    shift
    local shard_id=$1
    shift
    local bytes=$1
    shift
    local mode=${1}

    local poolname=pool-jerasure
    local -a initial_osds=($(get_osds "$poolname" "$objname"))
    local osd_id=${initial_osds[$shard_id]}
    # Keep the cluster from reacting while we manipulate the shard directly.
    ceph osd set noout
    local rv=0
    if [ "$mode" = "add" ];
    then
        objectstore_tool "$dir" "$osd_id" "$objname" get-bytes "$dir/CORRUPT" || rv=1
        dd if=/dev/urandom bs="$bytes" count=1 >> "$dir/CORRUPT"
    elif [ "$bytes" = "0" ];
    then
        touch "$dir/CORRUPT"
    else
        dd if=/dev/urandom bs="$bytes" count=1 of="$dir/CORRUPT"
    fi
    if [ "$rv" = "0" ]; then
        objectstore_tool "$dir" "$osd_id" "$objname" set-bytes "$dir/CORRUPT" || rv=1
    fi
    rm -f "$dir/CORRUPT"
    # Always clear noout, even on failure (the original returned early and
    # left noout set when objectstore_tool failed).
    ceph osd unset noout
    return $rv
}
220
# Write an object, corrupt the size of shard $2 (by $3 bytes, mode $4,
# default "set"), and verify reads still succeed; then corrupt shard $2+1
# too and verify the read fails.
function rados_get_data_bad_size() {
    local dir=$1
    shift
    local shard_id=$1
    shift
    local bytes=$1
    shift
    local mode=${1:-set}

    local poolname=pool-jerasure
    local objname=obj-size-$$-$shard_id-$bytes
    rados_put "$dir" "$poolname" "$objname" || return 1

    # Change the size of the specified shard
    #
    set_size "$objname" "$dir" "$shard_id" "$bytes" "$mode" || return 1

    # One bad shard: the client read must still succeed.
    rados_get "$dir" "$poolname" "$objname" || return 1

    # Leave objname and modify another shard
    shard_id=$((shard_id + 1))
    set_size "$objname" "$dir" "$shard_id" "$bytes" "$mode" || return 1
    rados_get "$dir" "$poolname" "$objname" fail || return 1
    rm "$dir/ORIGINAL"
}
246
#
# These two test cases try to validate the following behavior:
# For object on EC pool, if there is one shard having read error (
# either primary or replica), client can still read object.
#
# If 2 shards have read errors the client will get an error.
#
function TEST_rados_get_subread_eio_shard_0() {
    local dir=$1
    local poolname=pool-jerasure

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # inject eio on primary OSD (0) and replica OSD (1)
    rados_put_get_data eio $dir 0 || return 1
    delete_pool $poolname
}
265
function TEST_rados_get_subread_eio_shard_1() {
    local dir=$1
    local poolname=pool-jerasure

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # inject eio into replicas OSD (1) and OSD (2)
    rados_put_get_data eio $dir 1 || return 1
    delete_pool $poolname
}
277
# We don't remove the object from the primary because
# that just causes it to appear to be missing

function TEST_rados_get_subread_missing() {
    local dir=$1
    local poolname=pool-jerasure

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # inject remove into replicas OSD (1) and OSD (2)
    rados_put_get_data remove $dir 1 || return 1
    delete_pool $poolname
}
292
#
#
# These two test cases try to validate that following behavior:
# For object on EC pool, if there is one shard which an incorrect
# size this will cause an internal read error, client can still read object.
#
# If 2 shards have incorrect size the client will get an error.
#
function TEST_rados_get_bad_size_shard_0() {
    local dir=$1
    local poolname=pool-jerasure
    local bytes

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # Set incorrect size into primary OSD (0) and replica OSD (1)
    for bytes in 10 0 ; do
        rados_get_data_bad_size $dir 0 $bytes || return 1
    done
    rados_get_data_bad_size $dir 0 256 add || return 1
    delete_pool $poolname
}
314
function TEST_rados_get_bad_size_shard_1() {
    local dir=$1
    local poolname=pool-jerasure
    local bytes

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # Set incorrect size into replicas OSD (1) and OSD (2)
    for bytes in 10 0 ; do
        rados_get_data_bad_size $dir 1 $bytes || return 1
    done
    rados_get_data_bad_size $dir 1 256 add || return 1
    delete_pool $poolname
}
328
function TEST_rados_get_with_subreadall_eio_shard_0() {
    local dir=$1
    local poolname=pool-jerasure

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # inject eio on primary OSD (0), then exercise the recovery path too
    rados_put_get_data eio $dir 0 recovery || return 1

    delete_pool $poolname
}
342
function TEST_rados_get_with_subreadall_eio_shard_1() {
    local dir=$1
    local poolname=pool-jerasure

    setup_osds 4 || return 1
    create_erasure_coded_pool $poolname 2 1 || return 1
    # inject eio on replica OSD (1), then exercise the recovery path too
    rados_put_get_data eio $dir 1 recovery || return 1

    delete_pool $poolname
}
356
# Test recovery the first k copies aren't all available
function TEST_ec_recovery_errors() {
    local dir=$1
    local objname=myobject
    local poolname=pool-jerasure

    setup_osds 7 || return 1
    create_erasure_coded_pool $poolname 3 2 || return 1

    rados_put $dir $poolname $objname || return 1
    inject_eio ec data $poolname $objname $dir 0 || return 1

    # Take the last OSD in the object's acting set down and out.
    local -a acting=($(get_osds $poolname $objname))
    local victim=${acting[-1]}
    kill_daemons $dir TERM osd.${victim} >&2 < /dev/null || return 1
    ceph osd down ${victim} || return 1
    ceph osd out ${victim} || return 1

    # Cluster should recover this object
    wait_for_clean || return 1

    delete_pool $poolname
}
382
# Test backfill with unfound object
function TEST_ec_backfill_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=300
    # Must be between 1 and $lastobj
    local testobj=obj250

    # Shrink the PG log so the restarted OSD must backfill, not log-recover.
    export CEPH_ARGS
    CEPH_ARGS+=' --osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10'
    setup_osds 5 || return 1

    local poolname=pool-jerasure
    create_erasure_coded_pool $poolname 3 2 || return 1

    ceph pg dump pgs

    rados_put "$dir" $poolname $objname || return 1

    # Take the last OSD in the object's acting set down and out.
    local -a initial_osds=($(get_osds $poolname $objname))
    local last_osd=${initial_osds[-1]}
    # NOTE: was '2>&2' (a no-op); '>&2' matches the other kill_daemons calls.
    kill_daemons "$dir" TERM osd.${last_osd} >&2 < /dev/null || return 1
    ceph osd down ${last_osd} || return 1
    ceph osd out ${last_osd} || return 1

    ceph pg dump pgs

    # Write enough objects to roll the shortened PG log past the divergence.
    dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4
    local i
    for i in $(seq 1 $lastobj)
    do
        rados --pool $poolname put obj${i} "$dir/ORIGINAL" || return 1
    done

    # Break two shards of $testobj so backfill cannot reconstruct it.
    inject_eio ec data $poolname $testobj "$dir" 0 || return 1
    inject_eio ec data $poolname $testobj "$dir" 1 || return 1

    run_osd "$dir" ${last_osd} || return 1
    ceph osd in ${last_osd} || return 1

    sleep 15

    # Wait (up to ~100s) for the PG to report the unfound object.
    local tmp state
    for tmp in $(seq 1 100); do
        state=$(get_state 2.0)
        echo "$state"
        if echo "$state" | grep -q backfill_unfound; then
            break
        fi
        sleep 1
    done

    ceph pg dump pgs
    ceph pg 2.0 list_missing | grep -q $testobj || return 1

    # Command should hang because object is unfound; expect timeout's 124.
    timeout 5 rados -p $poolname get $testobj "$dir/CHECK"
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    # Every object except the deleted unfound one must read back intact.
    for i in $(seq 1 $lastobj)
    do
        if [ obj${i} = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p $poolname get $testobj "$dir/CHECK" || return 1
        else
            rados --pool $poolname get obj${i} "$dir/CHECK" || return 1
            diff -q "$dir/ORIGINAL" "$dir/CHECK" || return 1
        fi
    done

    rm -f "${dir}/ORIGINAL" "${dir}/CHECK"

    delete_pool $poolname
}
460
# Test recovery with unfound object
function TEST_ec_recovery_unfound() {
    local dir=$1
    local objname=myobject
    local lastobj=100
    # Must be between 1 and $lastobj
    local testobj=obj75

    setup_osds 5 || return 1

    local poolname=pool-jerasure
    create_erasure_coded_pool $poolname 3 2 || return 1

    ceph pg dump pgs

    rados_put "$dir" $poolname $objname || return 1

    # Take the last OSD in the object's acting set down and out.
    local -a initial_osds=($(get_osds $poolname $objname))
    local last_osd=${initial_osds[-1]}
    # NOTE: was '2>&2' (a no-op); '>&2' matches the other kill_daemons calls.
    kill_daemons "$dir" TERM osd.${last_osd} >&2 < /dev/null || return 1
    ceph osd down ${last_osd} || return 1
    ceph osd out ${last_osd} || return 1

    ceph pg dump pgs

    # Write more objects while the OSD is out so it has work on return.
    dd if=/dev/urandom of=${dir}/ORIGINAL bs=1024 count=4
    local i
    for i in $(seq 1 $lastobj)
    do
        rados --pool $poolname put obj${i} "$dir/ORIGINAL" || return 1
    done

    # Break two shards of $testobj so recovery cannot reconstruct it.
    inject_eio ec data $poolname $testobj "$dir" 0 || return 1
    inject_eio ec data $poolname $testobj "$dir" 1 || return 1

    run_osd "$dir" ${last_osd} || return 1
    ceph osd in ${last_osd} || return 1

    sleep 15

    # Wait (up to ~100s) for the PG to report the unfound object.
    local tmp state
    for tmp in $(seq 1 100); do
        state=$(get_state 2.0)
        echo "$state"
        if echo "$state" | grep -q recovery_unfound; then
            break
        fi
        sleep 1
    done

    ceph pg dump pgs
    ceph pg 2.0 list_missing | grep -q $testobj || return 1

    # Command should hang because object is unfound; expect timeout's 124.
    timeout 5 rados -p $poolname get $testobj "$dir/CHECK"
    test $? = "124" || return 1

    ceph pg 2.0 mark_unfound_lost delete

    wait_for_clean || return 1

    # Every object except the deleted unfound one must read back intact.
    for i in $(seq 1 $lastobj)
    do
        if [ obj${i} = "$testobj" ]; then
            # Doesn't exist anymore
            ! rados -p $poolname get $testobj "$dir/CHECK" || return 1
        else
            rados --pool $poolname get obj${i} "$dir/CHECK" || return 1
            diff -q "$dir/ORIGINAL" "$dir/CHECK" || return 1
        fi
    done

    rm -f "${dir}/ORIGINAL" "${dir}/CHECK"

    delete_pool $poolname
}
536
537 main test-erasure-eio "$@"
538
# Local Variables:
# compile-command: "cd ../.. ; make -j4 && test/erasure-code/test-erasure-eio.sh"
# End: