# ceph/qa/standalone/osd/osd-backfill-space.sh
# (imported from ceph.git, 14.2.4 nautilus point release)
1 #!/usr/bin/env bash
2 #
3 # Copyright (C) 2018 Red Hat <contact@redhat.com>
4 #
5 # Author: David Zafman <dzafman@redhat.com>
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU Library Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Library Public License for more details.
16 #
17
18 source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
19
# Test harness entry point: configure a single-mon test cluster with a
# faked 3600K statfs, then run each requested TEST_* function (or all
# of them) inside its own setup/teardown cycle.
function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7180" # git grep '\<7180\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    CEPH_ARGS+="--osd_min_pg_log_entries=5 --osd_max_pg_log_entries=10 "
    CEPH_ARGS+="--fake_statfs_for_testing=3686400 "
    CEPH_ARGS+="--osd_max_backfills=10 "
    export objects=600
    export poolprefix=test

    # Default to every TEST_* function currently defined in this file.
    local tests
    if [ $# -gt 0 ]; then
        tests="$@"
    else
        tests=$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')
    fi

    local tst
    for tst in $tests ; do
        setup $dir || return 1
        $tst $dir || return 1
        teardown $dir || return 1
    done
}
41
42
# Count the PGs whose state string contains $1 (e.g. "backfilling",
# "backfill_toofull", "active+clean").  Prints the count on stdout.
function get_num_in_state() {
    local want=$1
    local filter="select(contains(\"${want}\"))"
    ceph --format json pg dump pgs 2>/dev/null | \
        jq ".pg_stats | [.[] | .state | $filter] | length"
}
50
51
# Poll until no PG is in the given state.
# $1 - substring to look for in the PG state (e.g. "backfilling")
# $2 - base timeout handed to get_timeout_delays to build the sleep schedule
# Returns 0 once the count of matching PGs reaches 0.  Returns 1 (after
# dumping pgs for diagnostics) if the count stops changing for the whole
# delay schedule, i.e. no progress is being made.
function wait_for_state() {
    local state=$1
    local num_in_state=-1
    local cur_in_state
    local -a delays=($(get_timeout_delays $2 5))
    local -i loop=0

    flush_pg_stats || return 1
    # Wait until at least one PG exists before checking states.
    while test $(get_num_pgs) == 0 ; do
        sleep 1
    done

    while true ; do
        cur_in_state=$(get_num_in_state ${state})
        test $cur_in_state = "0" && break
        if test $cur_in_state != $num_in_state ; then
            # Progress was made: restart the delay schedule.
            loop=0
            num_in_state=$cur_in_state
        elif (( $loop >= ${#delays[*]} )) ; then
            # No change across the entire delay schedule: give up.
            ceph pg dump pgs
            return 1
        fi
        sleep ${delays[$loop]}
        loop+=1
    done
    return 0
}
79
80
# Block until no PG reports "backfilling"; fail after roughly $1 seconds
# without progress.
function wait_for_backfill() {
    local timeout=$1
    wait_for_state backfilling "$timeout"
}
85
86
# Block until no PG reports "activating"; fail after roughly $1 seconds
# without progress.
function wait_for_active() {
    local timeout=$1
    wait_for_state activating "$timeout"
}
91
92 # All tests are created in an environment which has fake total space
93 # of 3600K (3686400) which can hold 600 6K replicated objects or
94 # 200 18K shards of erasure coded objects. For a k=3, m=2 EC pool
95 # we have a theoretical 54K object but with the chunk size of 4K
96 # and a rounding of 4K to account for the chunks is 36K max object
97 # which is ((36K / 3) + 4K) * 200 = 3200K which is 88% of
98 # 3600K for a shard.
99
100 # Create 2 pools with size 1
101 # Write enough data that only 1 pool pg can fit per osd
# Increase the pool size to 2
103 # On 3 OSDs this should result in 1 OSD with overlapping replicas,
104 # so both pools can't fit. We assume pgid 1.0 and 2.0 won't
105 # map to the same 2 OSDs.
106 # At least 1 pool shouldn't have room to backfill
107 # All other pools should go active+clean
# Create 2 size-1 pools, fill them so only one pool's PG fits per OSD,
# then raise size to 2: exactly one PG should end in backfill_toofull
# while the rest go active+clean.
function TEST_backfill_test_simple() {
    local dir=$1
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 1
    done

    wait_for_clean || return 1

    # This won't work if the 2 pools' primary and only OSDs
    # are the same.

    # 4K objects * 600 per pool: a single pool's PG fits per OSD, two don't.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    ceph pg dump pgs

    # Grow both pools to size 2, forcing backfill onto a shared OSD.
    for p in $(seq 1 $pools)
    do
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 5

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    expected="$(expr $pools - 1)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
        echo "$expected didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    # Fail the test if any OSD logged a statistics mismatch.
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
183
184
185 # Create 8 pools of size 1 on 20 OSDs
186 # Write 4K * 600 objects (only 1 pool pg can fit on any given osd)
187 # Increase pool size to 2
188 # At least 1 pool shouldn't have room to backfill
189 # All other pools should go active+clean
# Create 8 size-1 pools on 20 OSDs, fill them, then raise size to 2.
# At least one PG must hit backfill_toofull; every PG that is not
# toofull must finish backfill and go active+clean.
function TEST_backfill_test_multi() {
    local dir=$1
    local pools=8
    local OSDS=20

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 1
    done

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 5

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ERRORS=0
    full="$(ceph pg dump pgs | grep +backfill_toofull | wc -l)"
    if [ "$full" -lt "1" ];
    then
        echo "At least one pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    # All PGs that are not toofull should have completed backfill.
    expected="$(expr $pools - $full)"
    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "$expected" ];
    then
        echo "$expected didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    # Work around for http://tracker.ceph.com/issues/38195
    kill_daemons $dir #|| return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
264
265
266 # To make sure that when 2 pg try to backfill at the same time to
267 # the same target. This might be covered by the simple test above
268 # but this makes sure we get it.
269 #
270 # Create 10 pools of size 2 and identify 2 that have the same
271 # non-primary osd.
272 # Delete all other pools
273 # Set size to 1 and write 4K * 600 to each pool
274 # Set size back to 2
275 # The 2 pools should race to backfill.
276 # One pool goes active+clean
# The other goes active+...+backfill_toofull
# Make two PGs race to backfill to the same target OSD: one must win
# (active+clean) and the other must end up backfill_toofull.
function TEST_backfill_test_sametarget() {
    local dir=$1
    local pools=10
    local OSDS=5

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 2
    done
    sleep 5

    wait_for_clean || return 1

    ceph pg dump pgs

    # Find 2 pools with a pg that have distinct primaries but their
    # second replica on the same osd.
    local PG1
    local POOLNUM1
    local pool1
    local chk_osd1
    local chk_osd2

    local PG2
    local POOLNUM2
    local pool2
    for p in $(seq 1 $pools)
    do
        ceph pg map ${p}.0 --format=json | jq '.acting[]' > $dir/acting
        local test_osd1=$(head -1 $dir/acting)
        local test_osd2=$(tail -1 $dir/acting)
        if [ $p = "1" ];
        then
            # Pool 1 is the reference; later pools are compared to it.
            PG1="${p}.0"
            POOLNUM1=$p
            pool1="${poolprefix}$p"
            chk_osd1=$test_osd1
            chk_osd2=$test_osd2
        elif [ $chk_osd1 != $test_osd1 -a $chk_osd2 = $test_osd2 ];
        then
            PG2="${p}.0"
            POOLNUM2=$p
            pool2="${poolprefix}$p"
            break
        fi
    done
    rm -f $dir/acting

    if [ "$pool2" = "" ];
    then
        echo "Failure to find appropirate PGs"
        return 1
    fi

    # Drop every pool except the racing pair.
    for p in $(seq 1 $pools)
    do
        if [ $p != $POOLNUM1 -a $p != $POOLNUM2 ];
        then
            delete_pool ${poolprefix}$p
        fi
    done

    ceph osd pool set $pool1 size 1
    ceph osd pool set $pool2 size 1

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for i in $(seq 1 $objects)
    do
        rados -p $pool1 put obj$i $dir/datafile
        rados -p $pool2 put obj$i $dir/datafile
    done

    # Growing both pools back to size 2 starts the backfill race.
    ceph osd pool set $pool1 size 2
    ceph osd pool set $pool2 size 2
    sleep 5

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool $pool1
    delete_pool $pool2
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
396
397 # 2 pools can't both backfill to a target which has other data
398 # 1 of the pools has objects that increase from 1024 to 2611 bytes
399 #
400 # Write to fill pool which is size 1
# Take fill pool osd down (other 2 pools must go to the remaining OSDs)
402 # Save an export of data on fill OSD and restart it
# Write an initial 1K to pool1 which has pg 2.0
404 # Export 2.0 from non-fillpool OSD don't wait for it to start-up
405 # Take down fillpool OSD
406 # Put 1K object version of 2.0 on fillpool OSD
407 # Put back fillpool data on fillpool OSD
408 # With fillpool down write 2611 byte objects
409 # Take down $osd and bring back $fillosd simultaneously
410 # Wait for backfilling
411 # One PG will be able to backfill its remaining data
412 # One PG must get backfill_toofull
# Two PGs race to backfill onto a target OSD that already holds other
# data; one must succeed, the other must go backfill_toofull.  The
# comment block above this function lists the choreography step by step.
function TEST_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1
    for p in $(seq 1 $pools)
    do
        create_pool "${poolprefix}$p" 1 1
        ceph osd pool set "${poolprefix}$p" size 2
    done

    wait_for_clean || return 1

    # Partially fill an osd
    # We have room for 600 6K replicated objects, if we create 2611 byte objects
    # there is 3600K - (2611 * 600) = 2070K, so the fill pool and one
    # replica from the other 2 is 85% of 3600K

    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
        rados -p fillpool put obj-fill-${o} $dir/datafile
    done

    # Pick a second osd ($osd) distinct from the one with the fill data.
    local fillosd=$(get_primary fillpool obj-fill-1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    sleep 5
    kill $(cat $dir/osd.$fillosd.pid)
    ceph osd out osd.$fillosd
    sleep 2

    # Save the fill data so it can be restored later.
    _objectstore_tool_nodown $dir $fillosd --op export-remove --pgid 1.0 --file $dir/fillexport.out || return 1
    activate_osd $dir $fillosd || return 1

    ceph pg dump pgs

    # Write the initial small (1K) objects to pool 1.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $objects)
    do
        rados -p "${poolprefix}1" put obj-1-${o} $dir/datafile
    done

    ceph pg dump pgs
    # The $osd OSD is started, but we don't wait so we can kill $fillosd at the same time
    _objectstore_tool_nowait $dir $osd --op export --pgid 2.0 --file $dir/export.out
    kill $(cat $dir/osd.$fillosd.pid)
    sleep 5
    # Seed $fillosd with the 1K-object version of pg 2.0 plus the original
    # fill data, so backfill finds partial data already in place there.
    _objectstore_tool_nodown $dir $fillosd --force --op remove --pgid 2.0
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out || return 1
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 1.0 --file $dir/fillexport.out || return 1
    ceph pg dump pgs
    sleep 20
    ceph pg dump pgs

    # re-write everything
    dd if=/dev/urandom of=$dir/datafile bs=2611 count=1
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj-${p}-${o} $dir/datafile
        done
    done

    # Swap the live $osd for the seeded $fillosd to force backfill to it.
    kill $(cat $dir/osd.$osd.pid)
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 15

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    flush_pg_stats || return 1
    ceph pg dump pgs

    ERRORS=0
    if [ "$(get_num_in_state backfill_toofull)" != "1" ];
    then
        echo "One PG should be in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(get_num_in_state active+clean)" != "2" ];
    then
        echo "Two PGs should be active+clean after one PG completed backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
535
536 # Make sure that the amount of bytes already on the replica doesn't
537 # cause an out of space condition
538 #
539 # Create 1 pool and write 4K * 600 objects
540 # Remove 25% (150) of the objects with one OSD down (noout set)
541 # Increase the size of the remaining 75% (450) of the objects to 6K
542 # Bring back down OSD
543 # The pool should go active+clean
# Verify that bytes already present on a backfill target are credited,
# so growing the remaining objects does not trigger a false toofull:
# the pool must end active+clean.
function TEST_backfill_grow() {
    local dir=$1
    local poolname="test"
    local OSDS=3

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    create_pool $poolname 1 1
    ceph osd pool set $poolname size 3
    sleep 5

    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/4kdata bs=1k count=4
    for i in $(seq 1 $objects)
    do
        rados -p $poolname put obj$i $dir/4kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    # Take one replica down; noout keeps it from being marked out.
    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    # Remove 25% of the objects while the replica is down ...
    rmobjects=$(expr $objects / 4)
    for i in $(seq 1 $rmobjects)
    do
        rados -p $poolname rm obj$i
    done

    # ... and grow the surviving 75% from 4K to 6K.
    dd if=/dev/urandom of=${dir}/6kdata bs=6k count=1
    for i in $(seq $(expr $rmobjects + 1) $objects)
    do
        rados -p $poolname put obj$i $dir/6kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
    ! grep -q "num_bytes mismatch" $dir/osd.*.log || return 1
}
603
604 # Create a 5 shard EC pool on 6 OSD cluster
605 # Fill 1 OSD with 2600K of data take that osd down.
606 # Write the EC pool on 5 OSDs
607 # Take down 1 (must contain an EC shard)
608 # Bring up OSD with fill data
# Not enough room to backfill to partially full OSD
# EC variant of the simple test: a partially filled OSD must not have
# room to accept a k=3/m=2 shard, producing backfill_toofull.
function TEST_ec_backfill_simple() {
    local dir=$1
    local EC=$2
    local pools=1
    local OSDS=6
    local k=3
    local m=2
    # EC objects are larger per logical object, so fewer are written.
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 13K objects
    # there is only 3600K - (13K * 200) = 1000K which won't hold
    # a k=3 shard below ((18K / 3) + 4K) * 200 = 2000K
    # Actual usage per shard is 8K * 200 = 1600K because 18K/3 is 6K which
    # rounds to 8K. The 2000K is the ceiling on the 18K * 200 = 3600K logical
    # bytes in the pool.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=13
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    # Pick a second osd ($osd) distinct from the filled one.
    local fillosd=$(get_primary fillpool obj1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    sleep 5
    kill $(cat $dir/osd.$fillosd.pid)
    ceph osd out osd.$fillosd
    sleep 2
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=18
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    # Swap a shard-holding osd for the filled one to force backfill to it.
    kill $(cat $dir/osd.$osd.pid)
    ceph osd out osd.$osd

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 30

    ceph pg dump pgs

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    # Exclude pg 1.0 (the fillpool) from the check.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ]; then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
709
# Print the OSD ids 0..OSDS-1, space-separated, skipping one id.
# $1 - total number of OSDs
# $2 - osd id to exclude from the list
# Outputs the ids on stdout (e.g. "0 1 3 4" for osdlist 5 2).
function osdlist() {
    local OSDS=$1
    local excludeosd=$2
    # Locals (the original leaked $osds/$osd into the global scope).
    local -a osds=()
    local osd

    for ((osd = 0; osd < OSDS; osd++))
    do
        # Quoted string compare: safe even if excludeosd is empty.
        if [ "$osd" != "$excludeosd" ]; then
            osds+=("$osd")
        fi
    done
    echo "${osds[@]}"
}
728
729 # Create a pool with size 1 and fill with data so that only 1 EC shard can fit.
730 # Write data to 2 EC pools mapped to the same OSDs (excluding filled one)
731 # Remap the last OSD to partially full OSD on both pools
732 # The 2 pools should race to backfill.
733 # One pool goes active+clean
# The other goes active+...+backfill_toofull
# Two EC pools race to backfill shards onto a partially filled OSD;
# one wins (active+clean), the other must go backfill_toofull.
function TEST_ec_backfill_multi() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit.  The fillosd
    # data plus 1 shard uses 75% of the space, leaving not
    # enough to stay under the 85% ratio set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    # Initially map both EC pools away from the filled osd.
    nonfillosds="$(osdlist $OSDS $fillosd)"

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
        ceph osd pg-upmap "$(expr $p + 1).0" $nonfillosds
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
        done
    done

    ceph pg dump pgs

    # Remap the last OSD of each pool to the filled one; both PGs now
    # race to backfill a shard there.
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap $(expr $p + 1).0 ${nonfillosds% *} $fillosd
    done

    sleep 10

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    # Exclude pg 1.0 (the fillpool) from both checks.
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
840
841 # Similar to TEST_ec_backfill_multi but one of the ec pools
842 # already had some data on the target OSD
843
844 # Create a pool with size 1 and fill with data so that only 1 EC shard can fit.
845 # Write a small amount of data to 1 EC pool that still includes the filled one
846 # Take down fillosd with noout set
847 # Write data to 2 EC pools mapped to the same OSDs (excluding filled one)
848 # Remap the last OSD to partially full OSD on both pools
849 # The 2 pools should race to backfill.
850 # One pool goes active+clean
# The other goes active+...+backfill_toofull
# NOTE(review): disabled via the SKIP_ prefix, and its name collides
# with a second SKIP_TEST_ec_backfill_multi_partial defined later in
# this file; the later definition shadows this one.  Rename one of them
# before re-enabling either.
function SKIP_TEST_ec_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=5
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)
    local lastosd=$(expr $OSDS - 1)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # This test requires that shards from 2 different pools
    # fit on a given OSD, but both will not fit.  The fillosd
    # data plus 1 shard uses 75% of the space, leaving not
    # enough to stay under the 85% ratio set here.
    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1
    # last osd
    ceph osd pg-upmap 1.0 $lastosd

    # Partially fill an osd
    # We have room for 200 18K replicated objects, if we create 9K objects
    # there is only 3600K - (9K * 200) = 1800K which will only hold
    # one k=3 shard below ((12K / 3) + 4K) * 200 = 1600K
    # The actual data will be (12K / 3) * 200 = 800K because the extra
    # is the reservation padding for chunking.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=9
    for o in $(seq 1 $ecobjects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    local fillosd=$(get_primary fillpool obj1)
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    nonfillosds="$(osdlist $OSDS $fillosd)"

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 15

    ceph pg dump pgs

    # Pre-seed pool 1 with a little data while $lastosd is still mapped.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=1
    for o in $(seq 1 $ecobjects)
    do
        rados -p "${poolprefix}1" put obj$o-1 $dir/datafile
    done

    # Drop $lastosd from both pools' mappings.
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $(expr $lastosd - 1))
    done
    ceph pg dump pgs

    #ceph osd set noout
    #kill_daemons $dir TERM osd.$lastosd || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $ecobjects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o-$p $dir/datafile
        done
    done

    ceph pg dump pgs

    # Now backfill lastosd by adding back into the upmap
    for p in $(seq 1 $pools)
    do
        ceph osd pg-upmap "$(expr $p + 1).0" $(seq 0 $lastosd)
    done
    #activate_osd $dir $lastosd || return 1
    #ceph tell osd.0 debug kick_recovery_wq 0

    sleep 10
    ceph pg dump pgs

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ceph pg dump pgs

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
979
# Second (duplicate) definition of SKIP_TEST_ec_backfill_multi_partial.
# NOTE(review): this shadows the earlier function of the same name; both
# are disabled via the SKIP_ prefix.  Rename one before re-enabling.
function SKIP_TEST_ec_backfill_multi_partial() {
    local dir=$1
    local EC=$2
    local pools=2
    local OSDS=6

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    # Below we need to fit 3200K in 3600K which is 88%
    # so set to 90%
    ceph osd set-backfillfull-ratio .90

    ceph osd set-require-min-compat-client luminous
    create_pool fillpool 1 1
    ceph osd pool set fillpool size 1

    # Partially fill an osd
    # We have room for 200 48K ec objects, if we create 4k replicated objects
    # there is 3600K - (4K * 200) = 2800K which won't hold 2 k=3 shard
    # of 200 12K objects which takes ((12K / 3) + 4K) * 200 = 1600K each.
    # On the other OSDs 2 * 1600K = 3200K which is 88% of 3600K.
    dd if=/dev/urandom of=$dir/datafile bs=1024 count=4
    for o in $(seq 1 $objects)
    do
        rados -p fillpool put obj$o $dir/datafile
    done

    # Pick a second osd ($osd) distinct from the filled one.
    local fillosd=$(get_primary fillpool obj1)
    osd=$(expr $fillosd + 1)
    if [ "$osd" = "$OSDS" ]; then
        osd="0"
    fi

    sleep 5
    kill $(cat $dir/osd.$fillosd.pid)
    ceph osd out osd.$fillosd
    sleep 2
    ceph osd erasure-code-profile set ec-profile k=3 m=2 crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1

    for p in $(seq 1 $pools)
    do
        ceph osd pool create "${poolprefix}$p" 1 1 erasure ec-profile
    done

    # Can't wait for clean here because we created a stale pg
    #wait_for_clean || return 1
    sleep 5

    ceph pg dump pgs

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=12
    for o in $(seq 1 $objects)
    do
        for p in $(seq 1 $pools)
        do
            rados -p "${poolprefix}$p" put obj$o $dir/datafile
        done
    done

    #ceph pg map 2.0 --format=json | jq '.'
    # Move pg 2.0's data onto the filled osd before bringing it back in.
    kill $(cat $dir/osd.$osd.pid)
    ceph osd out osd.$osd

    _objectstore_tool_nodown $dir $osd --op export --pgid 2.0 --file $dir/export.out
    _objectstore_tool_nodown $dir $fillosd --op import --pgid 2.0 --file $dir/export.out

    activate_osd $dir $fillosd || return 1
    ceph osd in osd.$fillosd
    sleep 15

    wait_for_backfill 240 || return 1
    wait_for_active 60 || return 1

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep +backfill_toofull | wc -l)" != "1" ];
    then
        echo "One pool should have been in backfill_toofull"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    if [ "$(ceph pg dump pgs | grep -v "^1.0" | grep active+clean | wc -l)" != "1" ];
    then
        echo "One didn't finish backfill"
        ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs

    if [ $ERRORS != "0" ];
    then
        return 1
    fi

    delete_pool fillpool
    for i in $(seq 1 $pools)
    do
        delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}
1087
1088 # Create 1 EC pool
1089 # Write 200 12K objects ((12K / 3) + 4K) *200) = 1600K
1090 # Take 1 shard's OSD down (with noout set)
1091 # Remove 50 objects ((12K / 3) + 4k) * 50) = 400K
1092 # Write 150 36K objects (grow 150 objects) 2400K
1093 # But there is already 1600K usage so backfill
1094 # would be too full if it didn't account for existing data
1095 # Bring back down OSD so it must backfill
1096 # It should go active+clean taking into account data already there
# EC variant of TEST_backfill_grow: existing shard bytes on the target
# must be credited so the grown objects still fit; the pool must end
# active+clean.
function TEST_ec_backfill_grow() {
    local dir=$1
    local poolname="test"
    local OSDS=6
    local k=3
    local m=2
    local ecobjects=$(expr $objects / $k)

    run_mon $dir a || return 1
    run_mgr $dir x || return 1

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $osd || return 1
    done

    ceph osd set-backfillfull-ratio .85

    ceph osd set-require-min-compat-client luminous
    ceph osd erasure-code-profile set ec-profile k=$k m=$m crush-failure-domain=osd technique=reed_sol_van plugin=jerasure || return 1
    ceph osd pool create $poolname 1 1 erasure ec-profile

    wait_for_clean || return 1

    dd if=/dev/urandom of=${dir}/12kdata bs=1k count=12
    for i in $(seq 1 $ecobjects)
    do
        rados -p $poolname put obj$i $dir/12kdata
    done

    local PG=$(get_pg $poolname obj1)
    # Remember primary during the backfill
    local primary=$(get_primary $poolname obj1)
    local otherosd=$(get_not_primary $poolname obj1)

    # Take one shard's OSD down; noout keeps it from being marked out.
    ceph osd set noout
    kill_daemons $dir TERM $otherosd || return 1

    # Remove 25% of the objects while the shard is down ...
    rmobjects=$(expr $ecobjects / 4)
    for i in $(seq 1 $rmobjects)
    do
        rados -p $poolname rm obj$i
    done

    # ... and grow the surviving 75% from 12K to 36K.
    dd if=/dev/urandom of=${dir}/36kdata bs=1k count=36
    for i in $(seq $(expr $rmobjects + 1) $ecobjects)
    do
        rados -p $poolname put obj$i $dir/36kdata
    done

    activate_osd $dir $otherosd || return 1

    ceph tell osd.$primary debug kick_recovery_wq 0

    sleep 2

    wait_for_clean || return 1

    delete_pool $poolname
    kill_daemons $dir || return 1
}
1158
1159 main osd-backfill-space "$@"
1160
1161 # Local Variables:
1162 # compile-command: "make -j4 && ../qa/run-standalone.sh osd-backfill-space.sh"
1163 # End: