# ceph/qa/tasks/cephfs/test_full.py
import json
import logging
import os
from textwrap import dedent
try:
    from typing import Optional
except ImportError:
    # make it work for python2
    pass
from teuthology.exceptions import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling whole cluster or just data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity = None  # type: Optional[int]
    fill_mb = None

    def is_full(self):
        return self.fs.is_full()

    def setUp(self):
        CephFSTestCase.setUp(self)

        mds_status = self.fs.rank_asok(["status"])

        # Capture the initial OSD map epoch for later use
        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # Script that syncs the client up with the MDS OSD map barrier.  The
        # barrier should be updated by the cap flush ack message.
        pyscript = dedent("""
            import os
            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0o600)
            os.fchmod(fd, 0o666)
            os.fsync(fd)
            os.close(fd)
            """)

        # Sync up client with initial MDS OSD map barrier.
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))

        # Grab the mount's initial OSD epoch: later we will check that
        # it hasn't advanced beyond this point.
        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)

        # Do a metadata operation on the client, and witness that it ends up
        # with the old OSD map from startup time (nothing has prompted the
        # client to update its map)
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)

        # Set a barrier on the MDS
        self.fs.rank_asok(["osdmap", "barrier", str(new_epoch)])

        # Sync up client with new MDS OSD map barrier
        path = os.path.join(self.mount_a.mountpoint, "baz")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_barrier, new_epoch)

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
        self.wait_until_true(
            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
            timeout=30)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")
        else:
            return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space.  else, delete the file that experienced
                          the failed write.
        """

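        # osd_mon_report_interval controls how often the OSDs report their
        # utilisation to the mons, so it bounds how quickly the cluster can
        # notice (and later clear) fullness; the waits below are scaled by it.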
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster.  This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
        try:
            self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval * 120)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

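        # Creating even an empty file requires an MDS metadata operation, which
        # should be rejected while the whole cluster (as opposed to just a data
        # pool) is full.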
        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval * 120)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

    def test_full_different_file(self):
        self._test_full(True)

    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Enough to trip the full flag
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        # factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        # to the OSDs)
        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval > 10:
            log.warning("This test may run rather slowly unless you decrease "
                        "osd_mon_report_interval (5 is a good setting)!")

        # Set the object_size to 1MB to make the objects distributed more evenly
        # among the OSDs to fix Tracker#45434
        file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576"
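        # The remote scripts below are str.format() templates: {fill_mb},
        # {file_path}, {file_layout}, {full_wait} and {is_fuse} are substituted
        # here, which is why literal braces inside the scripts are doubled.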
        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            file_layout=file_layout,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 512 * 1024)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

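            # Deliberately stop short of the nominal capacity (90%) so that the
            # fsync below can still succeed before the full flag is reported
            # back from the mons.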
            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, b'x' * 1024 * 1024)
                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
            print("done writing {{0}} bytes".format(bytes))

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print("successfully fsync'ed prior to getting full state reported")

            # buffered write, add more dirty data to the buffer
            print("starting buffered write")
            try:
                for n in range(0, int({fill_mb} * 0.2)):
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})
            except OSError:
                pass

            print("wrote, now waiting 30s and then doing a close we expect to fail")

            # Wait long enough for a background flush that should fail
            time.sleep(30)

            if {is_fuse}:
                # ...and check that the failed background flush is reflected in fclose
                try:
                    os.close(f)
                except OSError:
                    print("close() returned an error as expected")
                else:
                    raise RuntimeError("close() failed to raise error")
            else:
                # The kernel cephfs client does not raise errors on fclose
                os.close(f)

            os.unlink("{file_path}")
            """)
        self._remote_write_test(remote_script)

    def test_full_fsync(self):
        """
        That when the full flag is encountered during asynchronous
        flushes, an fwrite() may succeed while the subsequent
        fsync()/fclose() should return the ENOSPC error.
        """

        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 4096)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

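            # Write up to ~110% of the nominal capacity so that the full
            # condition is guaranteed to be reached somewhere in this loop.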
            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("wrote bytes via buffered write, moving on to fsync")
                except OSError as e:
                    if {is_fuse}:
                        print("Unexpected error %s from write() instead of fsync()" % e)
                        raise
                    else:
                        print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                        full = True
                        break

                try:
                    os.fsync(f)
                    print("fsync'ed successfully")
                except OSError as e:
                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                    full = True
                    break
                else:
                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync.  There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print("calling close")
            os.close(f)
            print("close() did not raise error")

            os.unlink("{file_path}")
            """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32  # arbitrary low-ish limit
    fill_mb = pool_capacity // (1024 * 1024)  # type: ignore
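    # (The "# type: ignore" above is likely needed because pool_capacity is
    # declared as Optional[int] on the base class, while here it is a
    # concrete int.)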

    # We are only testing quota handling on the data pool, not the metadata
    # pool.
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))


class TestClusterFull(FullnessTestCase):
    """
    Test data pool fullness, which indicates that an OSD has become too full
    """
    pool_capacity = None
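    # Memstore-backed OSDs keep the cluster's capacity small and predictable,
    # so the test can realistically fill it.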
    REQUIRE_MEMSTORE = True

    def setUp(self):
        super(TestClusterFull, self).setUp()

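        # Measure the capacity once and cache it on the class (not the
        # instance), so the value from the first test run is reused by
        # later tests.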
        if self.pool_capacity is None:
            TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))


# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']