ceph/qa/tasks/cephfs/test_full.py

   1
   2
   3 import json
   4 import logging
   5 import os
   6 from textwrap import dedent
   7 import time
   8 from teuthology.orchestra.run import CommandFailedError
   9 from tasks.cephfs.fuse_mount import FuseMount
  10 from tasks.cephfs.cephfs_test_case import CephFSTestCase
  11
  12
  13 log = logging.getLogger(__name__)
  14
  15
  16 class FullnessTestCase(CephFSTestCase):
  17     CLIENTS_REQUIRED = 2
  18
  19     # Subclasses define whether they're filling whole cluster or just data pool
  20     data_only = False
  21
  22     # Subclasses define how many bytes should be written to achieve fullness
  23     pool_capacity = None
  24     fill_mb = None
  25
  26     # Subclasses define what fullness means to them
  27     def is_full(self):
  28         raise NotImplementedError()
  29
  30     def setUp(self):
  31         CephFSTestCase.setUp(self)
  32
  33         mds_status = self.fs.rank_asok(["status"])
  34
  35         # Capture the initial OSD map epoch for later use
  36         self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']
  37
  38     def test_barrier(self):
  39         """
  40         That when an OSD epoch barrier is set on an MDS, subsequently
  41         issued capabilities cause clients to update their OSD map to that
  42         epoch.
  43         """
  44
  45         # Sync up clients with initial MDS OSD map barrier
  46         self.mount_a.open_no_data("foo")
  47         self.mount_b.open_no_data("bar")
  48
  49         # Grab mounts' initial OSD epochs: later we will check that
  50         # it hasn't advanced beyond this point.
  51         mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
  52         mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]
  53
  54         # Freshly mounted at start of test, should be up to date with OSD map
  55         self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
  56         self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)
  57
  58         # Set and unset a flag to cause OSD epoch to increment
  59         self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
  60         self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
  61
  62         out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
  63         new_epoch = json.loads(out)['epoch']
  64         self.assertNotEqual(self.initial_osd_epoch, new_epoch)
  65
  66         # Do a metadata operation on clients, witness that they end up with
  67         # the old OSD map from startup time (nothing has prompted client
  68         # to update its map)
  69         self.mount_a.open_no_data("alpha")
  70         self.mount_b.open_no_data("bravo1")
  71
  72         # Sleep long enough that if the OSD map was propagating it would
  73         # have done so (this is arbitrary because we are 'waiting' for something
  74         # to *not* happen).
  75         time.sleep(30)
  76
  77         mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
  78         self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
  79         mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch()
  80         self.assertEqual(mount_b_epoch, mount_b_initial_epoch)
  81
  82         # Set a barrier on the MDS
  83         self.fs.rank_asok(["osdmap", "barrier", new_epoch.__str__()])
  84
  85         # Do an operation on client B, witness that it ends up with
  86         # the latest OSD map from the barrier.  This shouldn't generate any
  87         # cap revokes to A because B was already the last one to touch
  88         # a file in root.
  89         self.mount_b.run_shell(["touch", "bravo2"])
  90         self.mount_b.open_no_data("bravo2")
  91
  92         # Some time passes here because the metadata part of the operation
  93         # completes immediately, while the resulting OSD map update happens
  94         # asynchronously (it's an Objecter::_maybe_request_map) as a result
  95         # of seeing the new epoch barrier.
  96         self.wait_until_equal(
  97             lambda: self.mount_b.get_osd_epoch(),
  98             (new_epoch, new_epoch),
  99             30,
 100             lambda x: x[0] > new_epoch or x[1] > new_epoch)
 101
 102         # ...and none of this should have affected the oblivious mount a,
 103         # because it wasn't doing any data or metadata IO
 104         mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
 105         self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
 106
 107     def _data_pool_name(self):
 108         data_pool_names = self.fs.get_data_pool_names()
 109         if len(data_pool_names) > 1:
 110             raise RuntimeError("This test can't handle multiple data pools")
 111         else:
 112             return data_pool_names[0]
 113
 114     def _test_full(self, easy_case):
 115         """
 116         - That a client trying to write data to a file is prevented
 117         from doing so with an -EFULL result
 118         - That they are also prevented from creating new files by the MDS.
 119         - That they may delete another file to get the system healthy again
 120
 121         :param easy_case: if true, delete a successfully written file to
 122                           free up space.  else, delete the file that experienced
 123                           the failed write.
 124         """
 125
 126         osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
 127
 128         log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
 129
 130         # Fill up the cluster.  This dd may or may not fail, as it depends on
 131         # how soon the cluster recognises its own fullness
 132         self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2)
 133         try:
 134             self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2)
 135         except CommandFailedError:
 136             log.info("Writing file B failed (full status happened already)")
 137             assert self.is_full()
 138         else:
 139             log.info("Writing file B succeeded (full status will happen soon)")
 140             self.wait_until_true(lambda: self.is_full(),
 141                                  timeout=osd_mon_report_interval * 5)
 142
 143         # Attempting to write more data should give me ENOSPC
 144         with self.assertRaises(CommandFailedError) as ar:
 145             self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2)
 146         self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"
 147
 148         # Wait for the MDS to see the latest OSD map so that it will reliably
 149         # be applying the policy of rejecting non-deletion metadata operations
 150         # while in the full state.
 151         osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
 152         self.wait_until_true(
 153             lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
 154             timeout=10)
 155
 156         if not self.data_only:
 157             with self.assertRaises(CommandFailedError):
 158                 self.mount_a.write_n_mb("small_file_1", 0)
 159
 160         # Clear out some space
 161         if easy_case:
 162             self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
 163             self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
 164         else:
 165             # In the hard case it is the file that filled the system.
 166             # Before the new #7317 (ENOSPC, epoch barrier) changes, this
 167             # would fail because the last objects written would be
 168             # stuck in the client cache as objecter operations.
 169             self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
 170             self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
 171
 172         # Here we are waiting for two things to happen:
 173         # * The MDS to purge the stray folder and execute object deletions
 174         #  * The OSDs to inform the mon that they are no longer full
 175         self.wait_until_true(lambda: not self.is_full(),
 176                              timeout=osd_mon_report_interval * 5)
 177
 178         # Wait for the MDS to see the latest OSD map so that it will reliably
 179         # be applying the free space policy
 180         osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
 181         self.wait_until_true(
 182             lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
 183             timeout=10)
 184
 185         # Now I should be able to write again
 186         self.mount_a.write_n_mb("large_file", 50, seek=0)
 187
 188         # Ensure that the MDS keeps its OSD epoch barrier across a restart
 189
 190     def test_full_different_file(self):
 191         self._test_full(True)
 192
 193     def test_full_same_file(self):
 194         self._test_full(False)
 195
 196     def _remote_write_test(self, template):
 197         """
 198         Run some remote python in a way that's useful for
 199         testing free space behaviour (see test_* methods using this)
 200         """
 201         file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
 202
 203         # Enough to trip the full flag
 204         osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
 205         mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
 206
 207         # Sufficient data to cause RADOS cluster to go 'full'
 208         log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
 209
 210         # Long enough for RADOS cluster to notice it is full and set flag on mons
 211         # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
 212         #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
 213         #  to the OSDs)
 214         full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5
 215
 216         # Configs for this test should bring this setting down in order to
 217         # run reasonably quickly
 218         if osd_mon_report_interval > 10:
 219             log.warn("This test may run rather slowly unless you decrease"
 220                      "osd_mon_report_interval (5 is a good setting)!")
 221
 222         self.mount_a.run_python(template.format(
 223             fill_mb=self.fill_mb,
 224             file_path=file_path,
 225             full_wait=full_wait,
 226             is_fuse=isinstance(self.mount_a, FuseMount)
 227         ))
 228
 229     def test_full_fclose(self):
 230         # A remote script which opens a file handle, fills up the filesystem, and then
 231         # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
 232         remote_script = dedent("""
 233             import time
 234             import datetime
 235             import subprocess
 236             import os
 237
 238             # Write some buffered data through before going full, all should be well
 239             print "writing some data through which we expect to succeed"
 240             bytes = 0
 241             f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
 242             bytes += os.write(f, 'a' * 512 * 1024)
 243             os.fsync(f)
 244             print "fsync'ed data successfully, will now attempt to fill fs"
 245
 246             # Okay, now we're going to fill up the filesystem, and then keep
 247             # writing until we see an error from fsync.  As long as we're doing
 248             # buffered IO, the error should always only appear from fsync and not
 249             # from write
 250             full = False
 251
 252             for n in range(0, int({fill_mb} * 0.9)):
 253                 bytes += os.write(f, 'x' * 1024 * 1024)
 254                 print "wrote {{0}} bytes via buffered write, may repeat".format(bytes)
 255             print "done writing {{0}} bytes".format(bytes)
 256
 257             # OK, now we should sneak in under the full condition
 258             # due to the time it takes the OSDs to report to the
 259             # mons, and get a successful fsync on our full-making data
 260             os.fsync(f)
 261             print "successfully fsync'ed prior to getting full state reported"
 262
 263             # buffered write, add more dirty data to the buffer
 264             print "starting buffered write"
 265             try:
 266                 for n in range(0, int({fill_mb} * 0.2)):
 267                     bytes += os.write(f, 'x' * 1024 * 1024)
 268                     print "sleeping a bit as we've exceeded 90% of our expected full ratio"
 269                     time.sleep({full_wait})
 270             except OSError:
 271                 pass;
 272
 273             print "wrote, now waiting 30s and then doing a close we expect to fail"
 274
 275             # Wait long enough for a background flush that should fail
 276             time.sleep(30)
 277
 278             if {is_fuse}:
 279                 # ...and check that the failed background flush is reflected in fclose
 280                 try:
 281                     os.close(f)
 282                 except OSError:
 283                     print "close() returned an error as expected"
 284                 else:
 285                     raise RuntimeError("close() failed to raise error")
 286             else:
 287                 # The kernel cephfs client does not raise errors on fclose
 288                 os.close(f)
 289
 290             os.unlink("{file_path}")
 291             """)
 292         self._remote_write_test(remote_script)
 293
 294     def test_full_fsync(self):
 295         """
 296         That when the full flag is encountered during asynchronous
 297         flushes, such that an fwrite() succeeds but an fsync/fclose()
 298         should return the ENOSPC error.
 299         """
 300
 301         # A remote script which opens a file handle, fills up the filesystem, and then
 302         # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
 303         remote_script = dedent("""
 304             import time
 305             import datetime
 306             import subprocess
 307             import os
 308
 309             # Write some buffered data through before going full, all should be well
 310             print "writing some data through which we expect to succeed"
 311             bytes = 0
 312             f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
 313             bytes += os.write(f, 'a' * 4096)
 314             os.fsync(f)
 315             print "fsync'ed data successfully, will now attempt to fill fs"
 316
 317             # Okay, now we're going to fill up the filesystem, and then keep
 318             # writing until we see an error from fsync.  As long as we're doing
 319             # buffered IO, the error should always only appear from fsync and not
 320             # from write
 321             full = False
 322
 323             for n in range(0, int({fill_mb} * 1.1)):
 324                 try:
 325                     bytes += os.write(f, 'x' * 1024 * 1024)
 326                     print "wrote bytes via buffered write, moving on to fsync"
 327                 except OSError as e:
 328                     print "Unexpected error %s from write() instead of fsync()" % e
 329                     raise
 330
 331                 try:
 332                     os.fsync(f)
 333                     print "fsync'ed successfully"
 334                 except OSError as e:
 335                     print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
 336                     full = True
 337                     break
 338                 else:
 339                     print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
 340
 341                 if n > {fill_mb} * 0.9:
 342                     # Be cautious in the last region where we expect to hit
 343                     # the full condition, so that we don't overshoot too dramatically
 344                     print "sleeping a bit as we've exceeded 90% of our expected full ratio"
 345                     time.sleep({full_wait})
 346
 347             if not full:
 348                 raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
 349
 350             # close() should not raise an error because we already caught it in
 351             # fsync.  There shouldn't have been any more writeback errors
 352             # since then because all IOs got cancelled on the full flag.
 353             print "calling close"
 354             os.close(f)
 355             print "close() did not raise error"
 356
 357             os.unlink("{file_path}")
 358             """)
 359
 360         self._remote_write_test(remote_script)
 361
 362
 363 class TestQuotaFull(FullnessTestCase):
 364     """
 365     Test per-pool fullness, which indicates quota limits exceeded
 366     """
 367     pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
 368     fill_mb = pool_capacity / (1024 * 1024)
 369
 370     # We are only testing quota handling on the data pool, not the metadata
 371     # pool.
 372     data_only = True
 373
 374     def setUp(self):
 375         super(TestQuotaFull, self).setUp()
 376
 377         pool_name = self.fs.get_data_pool_name()
 378         self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
 379                                             "max_bytes", "{0}".format(self.pool_capacity))
 380
 381     def is_full(self):
 382         return self.fs.is_full()
 383
 384
 385 class TestClusterFull(FullnessTestCase):
 386     """
 387     Test data pool fullness, which indicates that an OSD has become too full
 388     """
 389     pool_capacity = None
 390     REQUIRE_MEMSTORE = True
 391
 392     def setUp(self):
 393         super(TestClusterFull, self).setUp()
 394
 395         if self.pool_capacity is None:
 396             max_avail = self.fs.get_pool_df(self._data_pool_name())['max_avail']
 397             full_ratio = float(self.fs.get_config("mon_osd_full_ratio", service_type="mon"))
 398             TestClusterFull.pool_capacity = int(max_avail * full_ratio)
 399             TestClusterFull.fill_mb = (self.pool_capacity / (1024 * 1024))
 400
 401     def is_full(self):
 402         return self.fs.is_full()
 403
 404 # Hide the parent class so that unittest.loader doesn't try to run it.
 405 del globals()['FullnessTestCase']