]> git.proxmox.com Git - ceph.git/blob - ceph/qa/standalone/osd-backfill/osd-backfill-space.sh
8bc452d6cbbfc2b401bd5cfd24027d0d66016bdc
[ceph.git] / ceph / qa / standalone / osd-backfill / osd-backfill-space.sh
1 #!/usr/bin/env bash
2 #
3 # Copyright (C) 2018 Red Hat <contact@redhat.com>
4 #
5 # Author: David Zafman <dzafman@redhat.com>
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
16 #
17
18 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
19
function run() {
    # Entry point called by main: run each requested TEST_* function in a
    # freshly set-up test cluster rooted at $dir.
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7180" # git grep '\<7180\>' : there must be only one
    export CEPH_ARGS
    # --fake_statfs_for_testing makes every OSD report 3600K (3686400 bytes)
    # total space, so small writes can trigger backfill_toofull.
    local opt
    for opt in \
        "--fsid=$(uuidgen) --auth-supported=none" \
        "--mon-host=$CEPH_MON" \
        "--osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10" \
        "--fake_statfs_for_testing=3686400" \
        "--osd_max_backfills=10" \
        "--osd_mclock_profile=high_recovery_ops"
    do
        CEPH_ARGS+="$opt "
    done
    export objects=600
    export poolprefix=test

    # Default to every TEST_* function defined in this file.
    local selected=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    local testfn
    for testfn in $selected ; do
        setup $dir || return 1
        $testfn $dir || return 1
        teardown $dir || return 1
    done
}
42
43
function get_num_in_state() {
    # Print the number of PGs whose state string contains $1
    # (e.g. "backfilling", "backfill_toofull", "active+clean").
    local target=$1
    local filter="select(contains(\"${target}\"))"
    ceph --format json pg dump pgs 2>/dev/null | \
      jq ".pg_stats | [.[] | .state | $filter] | length"
}
51
52
function wait_for_not_state() {
    # Poll until no PG's state string contains $1, or give up after the
    # timeout schedule derived from $2 expires.
    # $1 - state substring to wait to disappear (e.g. "backfilling")
    # $2 - base timeout in seconds; get_timeout_delays expands it into a
    #      sequence of sleep intervals
    # Returns 0 once no PG is in the state, 1 on timeout (after dumping pgs).
    local state=$1
    local num_in_state=-1
    local cur_in_state
    local -a delays=($(get_timeout_delays $2 5))
    local -i loop=0

    flush_pg_stats || return 1
    # Don't start polling states until PG stats actually exist.
    while test $(get_num_pgs) == 0 ; do
        sleep 1
    done

    while true ; do
        cur_in_state=$(get_num_in_state ${state})
        test $cur_in_state = "0" && break
        if test $cur_in_state != $num_in_state ; then
            # The count changed, i.e. progress was made: restart the
            # delay schedule from the beginning.
            loop=0
            num_in_state=$cur_in_state
        elif (( $loop >= ${#delays[*]} )) ; then
            # No progress for the entire delay schedule: time out.
            ceph pg dump pgs
            return 1
        fi
        sleep ${delays[$loop]}
        loop+=1
    done
    return 0
}
80
81
function wait_for_not_backfilling() {
    # Block until no PG reports "backfilling", failing after $1 seconds.
    wait_for_not_state backfilling "$1"
}
86
87
function wait_for_not_activating() {
    # Block until no PG reports "activating", failing after $1 seconds.
    wait_for_not_state activating "$1"
}
92
93 # All tests are created in an environment which has fake total space
94 # of 3600K (3686400) which can hold 600 6K replicated objects or
95 # 200 18K shards of erasure coded objects. For a k=3, m=2 EC pool
96 # we have a theoretical 54K object but with the chunk size of 4K
97 # and a rounding of 4K to account for the chunks is 36K max object
98 # which is ((36K / 3) + 4K) * 200 = 3200K which is 88% of
99 # 3600K for a shard.
100
101 # Create 2 pools with size 1
102 # Write enough data that only 1 pool pg can fit per osd
# Increase the pool size to 2
104 # On 3 OSDs this should result in 1 OSD with overlapping replicas,
105 # so both pools can't fit. We assume pgid 1.0 and 2.0 won't
106 # map to the same 2 OSDs.
107 # At least 1 pool shouldn't have room to backfill
108 # All other pools should go active+clean
function TEST_backfill_test_simple() {
    # Two size-1 pools filled so each OSD can hold only one pool's pg;
    # growing both pools to size 2 should leave exactly one pg toofull.
    local dir=$1
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it
    done

    wait_for_clean || return 1

    # This won't work if the 2 pools' primary (and only) osds
    # are the same.

    # 600 objects of 4K each ~= the full fake 3600K OSD capacity per pool.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    ceph pg dump pgs

    # Grow both pools to size 2, forcing each pg to backfill a replica.
    for p in $(seq 1 $pools)
    do
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    expected="$(expr $pools - 1)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
        echo "$expected didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    # Fail if any OSD logged a stats accounting mismatch.
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
184
185
186 # Create 8 pools of size 1 on 20 OSDs
187 # Write 4K * 600 objects (only 1 pool pg can fit on any given osd)
188 # Increase pool size to 2
189 # At least 1 pool shouldn't have room to backfill
190 # All other pools should go active+clean
function TEST_backfill_test_multi() {
    # Same idea as TEST_backfill_test_simple but with 8 pools over 20 OSDs;
    # also validates the PG_BACKFILL_FULL health check output.
    local dir=$1
    local pools=8
    local OSDS=20

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it
    done

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    full="$(ceph pg dump pgs | grep +backfill_toofull | wc -l)"
    if [ "$full" -lt "1" ];
    then
        echo "At least one pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    expected="$(expr $pools - $full)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
        echo "$expected didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs
    ceph status

    ceph status --format=json-pretty > $dir/stat.json

    # eval strips the surrounding JSON double quotes from jq's output.
    eval SEV=$(jq '.health.checks.PG_BACKFILL_FULL.severity' $dir/stat.json)
    if [ "$SEV" != "HEALTH_WARN" ]; then
        echo "PG_BACKFILL_FULL severity $SEV not HEALTH_WARN"
        ERRORS="$(expr $ERRORS + 1)"
    fi
    eval MSG=$(jq '.health.checks.PG_BACKFILL_FULL.summary.message' $dir/stat.json)
    # NOTE(review): this expects exactly 4 toofull pgs although the check
    # above only requires >= 1 — confirm 4 is deterministic for this layout.
    if [ "$MSG" != "Low space hindering backfill (add storage if this doesn't resolve itself): 4 pgs backfill_toofull" ]; then
        echo "PG_BACKFILL_FULL message '$MSG' mismatched"
        ERRORS="$(expr $ERRORS + 1)"
    fi
    rm -f $dir/stat.json

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
280
281
282 # To make sure that when 2 pg try to backfill at the same time to
283 # the same target. This might be covered by the simple test above
284 # but this makes sure we get it.
285 #
286 # Create 10 pools of size 2 and identify 2 that have the same
287 # non-primary osd.
288 # Delete all other pools
289 # Set size to 1 and write 4K * 600 to each pool
290 # Set size back to 2
291 # The 2 pools should race to backfill.
292 # One pool goes active+clean
# The other goes active+...+backfill_toofull
function TEST_backfill_test_sametarget() {
    # Force two pgs from different pools to backfill to the same target OSD
    # at once; only one can fit, the other must go backfill_toofull.
    local dir=$1
    local pools=10
    local OSDS=5

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 5

    wait_for_clean || return 1

    ceph pg dump pgs

    # Find 2 pools with a pg that has distinct primaries but the second
    # replica on the same osd.
    local PG1
    local POOLNUM1
    local pool1
    local chk_osd1
    local chk_osd2

    local PG2
    local POOLNUM2
    local pool2
    for p in $(seq 1 $pools)
    do
        # acting set: first line is the primary, last line the 2nd replica.
        ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting
        local test_osd1=$(head -1 $dir/acting)
        local test_osd2=$(tail -1 $dir/acting)
        if [ $p = "1" ];
        then
            # Pool 1 is the reference; later pools are compared against it.
            PG1="${p}.0"
            POOLNUM1=$p
            pool1="${poolprefix}$p"
            chk_osd1=$test_osd1
            chk_osd2=$test_osd2
        elif [ $chk_osd1 != $test_osd1 -a $chk_osd2 = $test_osd2 ];
        then
            PG2="${p}.0"
            POOLNUM2=$p
            pool2="${poolprefix}$p"
            break
        fi
    done
    rm -f $dir/acting

    if [ "$pool2" = "" ];
    then
        echo "Failure to find appropirate PGs"
        return 1
    fi

    # Only keep the two pools that share a 2nd-replica OSD.
    for p in $(seq 1 $pools)
    do
        if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ];
        then
            delete_pool ${poolprefix}$p
        fi
    done

    ceph osd pool set $pool1 size 1 --yes-i-really-mean-it
    ceph osd pool set $pool2 size 1 --yes-i-really-mean-it

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for i in $(seq 1 $objects)
    do
        rados -p $pool1 put obj$i $dir/datafile
        rados -p $pool2 put obj$i $dir/datafile
    done

    # Grow both pools back to size 2 so they race to backfill the shared OSD.
    ceph osd pool set $pool1 size 2
    ceph osd pool set $pool2 size 2
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool $pool1
    delete_pool $pool2
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
412
413 # 2 pools can't both backfill to a target which has other data
414 # 1 of the pools has objects that increase from 1024 to 2611 bytes
415 #
416 # Write to fill pool which is size 1
# Take fill pool osd down (other 2 pools must go to the remaining OSDs)
# Save an export of data on fill OSD and restart it
# Write an initial 1K to pool1 which has pg 2.0
420 # Export 2.0 from non-fillpool OSD don't wait for it to start-up
421 # Take down fillpool OSD
422 # Put 1K object version of 2.0 on fillpool OSD
423 # Put back fillpool data on fillpool OSD
424 # With fillpool down write 2611 byte objects
425 # Take down $osd and bring back $fillosd simultaneously
426 # Wait for backfilling
427 # One PG will be able to backfill its remaining data
428 # One PG must get backfill_toofull
function TEST_backfill_multi_partial() {
    # Two pools race to backfill a target that already holds fillpool data
    # plus a stale 1K copy of pg 2.0; only one pool can fit.
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it
    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 2
    done

    wait_for_clean || return 1

    # Partially fill an osd
    # We have room for 600 6K replicated objects, if we create 2611 byte objects
    # there is 3600K - (2611 * 600) = 2070K, so the fill pool and one
    # replica from the other 2 is 85% of 3600K

    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
        rados -p fillpool put obj-fill-${o} $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj-fill-1)
    # Pick any OSD other than fillosd (wrap around past the last id).
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd

    # Save fillpool's pg (1.0) off the down OSD, then bring it back empty.
    _objectstore_tool_nodown $dir $fillosd --op export-remove --pgid 1.0 --file $dir/fillexport.out || return 1
    activate_osd $dir $fillosd || return 1

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $objects)
    do
        rados -p "${poolprefix}1" put obj-1-${o} $dir/datafile
    done

    ceph pg dump pgs
    # The $osd OSD is started, but we don't wait so we can kill $fillosd at the same time
    _objectstore_tool_nowait $dir $osd --op export --pgid 2.0 --file $dir/export.out
    kill_daemon $dir/osd.$fillosd.pid TERM
    # Seed fillosd with the stale 1K version of 2.0 plus the fillpool data.
    _objectstore_tool_nodown $dir $fillosd --force --op remove --pgid 2.0
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out || return 1
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 1.0 --file $dir/fillexport.out || return 1
    ceph pg dump pgs
    sleep 20
    ceph pg dump pgs

    # re-write everything
    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj-${p}-${o} $dir/datafile
        done
    done

    # Swap which OSD is down so both pools must backfill to fillosd.
    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    flush_pg_stats || return 1
    ceph pg dump pgs

    ERRORS=0
    if [ "$(get_num_in_state backfill_toofull)" != "1" ];
    then
        echo "One PG should be in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(get_num_in_state active+clean)" != "2" ];
    then
        echo "Two PGs should be active+clean after one PG completed backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
548
549 # Make sure that the amount of bytes already on the replica doesn't
550 # cause an out of space condition
551 #
552 # Create 1 pool and write 4K * 600 objects
553 # Remove 25% (150) of the objects with one OSD down (noout set)
554 # Increase the size of the remaining 75% (450) of the objects to 6K
555 # Bring back down OSD
556 # The pool should go active+clean
function TEST_backfill_grow() {
    # Verify that backfill accounts for bytes already present on the target:
    # objects grow from 4K to 6K while one OSD is down, then it must
    # backfill successfully into the space freed by deleted objects.
    local dir=$1
    local poolname="test"
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    sleep 5

    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/4kdata bs=1k count=4
    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i $dir/4kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    # noout keeps the down OSD in the map so it backfills on return.
    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    # Remove 25% of the objects, then grow the remaining 75% to 6K.
    rmobjects=$(expr $objects / 4)
    for i in $(seq 1 $rmobjects)
    do
        rados -p $poolname rm obj$i
    done

    dd if=/dev/urandom of=${dir}/6kdata bs=6k count=1
    for i in $(seq $(expr $rmobjects + 1) $objects)
    do
        rados -p $poolname put obj$i $dir/6kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    # Must reach active+clean: the target's existing bytes are replaced,
    # not added on top of, so it is not too full.
    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
616
617 # Create a 5 shard EC pool on 6 OSD cluster
618 # Fill 1 OSD with 2600K of data take that osd down.
619 # Write the EC pool on 5 OSDs
620 # Take down 1 (must contain an EC shard)
621 # Bring up OSD with fill data
# Not enough room to backfill to partially full OSD
function TEST_ec_backfill_simple() {
    # An EC (k=3,m=2) pg cannot backfill onto an OSD partially filled with
    # 13K fillpool objects; expect exactly one pg in backfill_toofull.
    local dir=$1
    local EC=$2
    local pools=1
    local OSDS=6
    local k=3
    local m=2
    # Same logical data volume as the replicated tests, spread over k shards.
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 13K objects
    # there is only 3600K - (13K * 200) = 1000K which won't hold
    # a k=3 shard below ((18K / 3) + 4K) * 200 = 2000K
    # Actual usage per shard is 8K * 200 = 1600K because 18K/3 is 6K which
    # rounds to 8K. The 2000K is the ceiling on the 18K * 200 = 3600K logical
    # bytes in the pool.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=13
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    # Pick any OSD other than fillosd (wrap around past the last id).
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    sleep 5
    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd
    sleep 2
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=18
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    # Take down an EC shard holder and bring the partially-full OSD back,
    # forcing the EC pg to try backfilling onto it.
    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    ceph pg dump pgs

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    # grep -v "^1.0" excludes the fillpool pg from the count.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ]; then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
722
function osdlist() {
    # Print the OSD ids 0 .. OSDS-1 as a single space-separated line,
    # skipping the excluded id.
    # $1 - total number of OSDs
    # $2 - OSD id to leave out of the list
    local OSDS=$1
    local excludeosd=$2
    # 'local' prevents clobbering the caller's $osds/$osd if this is ever
    # called outside a $(...) subshell (the original leaked both).
    local osds=""
    local osd

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        if [ "$osd" = "$excludeosd" ];
        then
            continue
        fi
        # Separate entries with a single space, no trailing space.
        if [ -n "$osds" ]; then
            osds="${osds} "
        fi
        osds="${osds}${osd}"
    done
    echo "$osds"
}
741
742 # Create a pool with size 1 and fill with data so that only 1 EC shard can fit.
743 # Write data to 2 EC pools mapped to the same OSDs (excluding filled one)
744 # Remap the last OSD to partially full OSD on both pools
745 # The 2 pools should race to backfill.
746 # One pool goes active+clean
# The other goes active+...+backfill_toofull
function TEST_ec_backfill_multi() {
    # Two EC pools race to backfill shards onto the partially-filled OSD;
    # one wins (active+clean) and the other goes backfill_toofull.
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit. I'm making
    # the fillosd plus 1 shard use 75% of the space,
    # leaving not enough to be under the 85% set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    # NOTE(review): k=3 m=2 are hard-coded here; should probably use
    # k=$k m=$m for consistency with the locals declared above.
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    nonfillosds="$(osdlist $OSDS $fillosd)"

    # Map both EC pgs (pool ids 2 and 3, so pgids 2.0 and 3.0) away from
    # fillosd initially.
    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
        ceph osd pg-upmap "$(expr $p + 1).0" $nonfillosds
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
        done
    done

    ceph pg dump pgs

    # Replace the last OSD in each upmap (${nonfillosds% *} drops it)
    # with fillosd, so both pgs must backfill a shard onto fillosd.
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap $(expr $p + 1).0 ${nonfillosds% *} $fillosd
    done

    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    # grep -v "^1.0" excludes the fillpool pg from the counts.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
853
854 # Similar to TEST_ec_backfill_multi but one of the ec pools
855 # already had some data on the target OSD
856
857 # Create a pool with size 1 and fill with data so that only 1 EC shard can fit.
858 # Write a small amount of data to 1 EC pool that still includes the filled one
859 # Take down fillosd with noout set
860 # Write data to 2 EC pools mapped to the same OSDs (excluding filled one)
861 # Remap the last OSD to partially full OSD on both pools
862 # The 2 pools should race to backfill.
863 # One pool goes active+clean
# The other goes active+...+backfill_toofull
# NOTE(review): this function has the same name as the later
# SKIP_TEST_ec_backfill_multi_partial below; bash keeps the later
# definition, so this one is dead code even ignoring the SKIP_ prefix.
# One of the two should be renamed or removed.
function SKIP_TEST_ec_backfill_multi_partial() {
    # Like TEST_ec_backfill_multi, but one EC pool already has some data
    # on the backfill target before the race starts.
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=5
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)
    local lastosd=$(expr $OSDS - 1)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit. I'm making
    # the fillosd plus 1 shard use 75% of the space,
    # leaving not enough to be under the 85% set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it
    # Pin the fillpool pg to the last osd.
    ceph osd pg-upmap 1.0 $lastosd

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    # NOTE(review): k=3 m=2 hard-coded instead of k=$k m=$m.
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    nonfillosds="$(osdlist $OSDS $fillosd)"

    # Initially map both EC pgs across all OSDs including lastosd.
    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    # Seed pool 1 with a small amount of data while lastosd is still in
    # its acting set.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $ecobjects)
    do
        rados -p "${poolprefix}1" put obj$o-1 $dir/datafile
    done

    # Remap both pgs away from lastosd before the bulk write.
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $(expr $lastosd - 1))
    done
    ceph pg dump pgs

    #ceph osd set noout
    #kill_daemons $dir TERM osd.$lastosd || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
        done
    done

    ceph pg dump pgs

    # Now backfill lastosd by adding back into the upmap
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done
    #activate_osd $dir $lastosd || return 1
    #ceph tell osd.0 debug kick_recovery_wq 0

    sleep 30
    ceph pg dump pgs

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    # grep -v "^1.0" excludes the fillpool pg from the counts.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
992
# NOTE(review): duplicate definition — this redefines (and therefore
# overrides) the SKIP_TEST_ec_backfill_multi_partial above. One of the
# two should be renamed or removed.
function SKIP_TEST_ec_backfill_multi_partial() {
    # Variant that seeds the backfill target via objectstore-tool export/
    # import of pg 2.0 instead of upmap juggling.
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # Below we need to fit 3200K in 3600K which is 88%
    # so set to 90%
    ceph osd set-backfillfull-ratio .90

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1 --yes-i-really-mean-it

    # Partially fill an osd
    # We have room for 200 48K ec objects, if we create 4k replicated objects
    # there is 3600K - (4K * 200) = 2800K which won't hold 2 k=3 shard
    # of 200 12K objects which takes ((12K / 3) + 4K) * 200 = 1600K each.
    # On the other OSDs 2 * 1600K = 3200K which is 88% of 3600K.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    # Pick any OSD other than fillosd (wrap around past the last id).
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    sleep 5
    kill_daemon $dir/osd.$fillosd.pid TERM
    ceph osd out osd.$fillosd
    sleep 2
    # NOTE(review): k=3 m=2 hard-coded, unlike the TEST_* functions that
    # parameterize via $k/$m.
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    #ceph pg map 2.0 --format=json | jq '.'
    kill_daemon $dir/osd.$osd.pid TERM
    ceph osd out osd.$osd

    # Copy pg 2.0 onto fillosd so it already holds partial data before
    # the backfill race starts.
    _objectstore_tool_nodown $dir $osd --op export --pgid 2.0 --file $dir/export.out
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    wait_for_not_backfilling 1200 || return 1
    wait_for_not_activating 60 || return 1

    ERRORS=0
    # grep -v "^1.0" excludes the fillpool pg from the counts.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
1100
1101 # Create 1 EC pool
1102 # Write 200 12K objects ((12K / 3) + 4K) *200) = 1600K
1103 # Take 1 shard's OSD down (with noout set)
1104 # Remove 50 objects ((12K / 3) + 4k) * 50) = 400K
1105 # Write 150 36K objects (grow 150 objects) 2400K
1106 # But there is already 1600K usage so backfill
1107 # would be too full if it didn't account for existing data
1108 # Bring back down OSD so it must backfill
1109 # It should go active+clean taking into account data already there
function TEST_ec_backfill_grow() {
    # EC analogue of TEST_backfill_grow: objects grow from 12K to 36K while
    # one shard's OSD is down; backfill must account for the 1600K of data
    # already on that OSD and still reach active+clean.
    local dir=$1
    local poolname="test"
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1
    ceph osd pool create $poolname 1 1 erasure ec-profile

    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/12kdata bs=1k count=12
    for i in $(seq 1 $ecobjects)
    do
        rados -p $poolname put obj$i $dir/12kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    # noout keeps the down OSD in the map so it backfills on return.
    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    # Remove 25% of the objects, then grow the rest from 12K to 36K.
    rmobjects=$(expr $ecobjects / 4)
    for i in $(seq 1 $rmobjects)
    do
        rados -p $poolname rm obj$i
    done

    dd if=/dev/urandom of=${dir}/36kdata bs=1k count=36
    for i in $(seq $(expr $rmobjects + 1) $ecobjects)
    do
        rados -p $poolname put obj$i $dir/36kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
1171
1172 main osd-backfill-space "$@"
1173
1174 # Local Variables:
1175 # compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-space.sh"
1176 # End: