import json
import logging
import os
from textwrap import dedent
from typing import Optional
from teuthology.exceptions import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling the whole cluster or just the data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity: Optional[int] = None
    fill_mb = None

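    # is_full() delegates to Filesystem.is_full(); broadly, that reports whether
    # the relevant pool has been flagged full (see tasks/cephfs/filesystem.py).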
    def is_full(self):
        return self.fs.is_full()

    def setUp(self):
        CephFSTestCase.setUp(self)

        mds_status = self.fs.rank_asok(["status"])

        # Capture the initial OSD map epoch for later use
        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # A script that syncs the client up with the MDS OSD map barrier.
        # The barrier should be updated by the cap flush ack message.
        pyscript = dedent("""
            import os
            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0o600)
            os.fchmod(fd, 0o666)
            os.fsync(fd)
            os.close(fd)
            """)

        # Sync up client with initial MDS OSD map barrier.
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))

        # Grab the mount's initial OSD epoch: later we will check that
        # it hasn't advanced beyond this point.
        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)

        # Do a metadata operation on clients, witness that they end up with
        # the old OSD map from startup time (nothing has prompted client
        # to update its map)
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)

        # Set a barrier on the MDS
        self.fs.rank_asok(["osdmap", "barrier", str(new_epoch)])

        # Sync up client with new MDS OSD map barrier
        path = os.path.join(self.mount_a.mountpoint, "baz")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_barrier, new_epoch)

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
        self.wait_until_true(
            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
            timeout=30)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")
        else:
            return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space.  Otherwise, delete the file that
                          experienced the failed write.
        """
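        # test_full_different_file() below exercises the easy case;
        # test_full_same_file() exercises the hard case.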

        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster.  This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
        try:
            self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval * 120)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval * 120)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

    def test_full_different_file(self):
        self._test_full(True)

    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Enough to trip the full flag
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        #  to the OSDs)
        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5
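        # For example (hypothetical values): with osd_mon_report_interval=5, as
        # suggested by the warning below, and mon_tick_interval=5, full_wait
        # works out to (5 + 5) * 1.5 = 15 seconds.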

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval > 10:
            log.warning("This test may run rather slowly unless you decrease "
                        "osd_mon_report_interval (5 is a good setting)!")

        # Set the object_size to 1MB to make the objects distributed more evenly
        # among the OSDs to fix Tracker#45434
        file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576"
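        # 1048576 bytes is 1MiB, so with stripe_count=1 and a 1MiB object_size each
        # megabyte written by the remote script lands in its own RADOS object
        # instead of growing one large object.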
        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            file_layout=file_layout,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 512 * 1024)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, b'x' * 1024 * 1024)
                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
            print("done writing {{0}} bytes".format(bytes))

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print("successfully fsync'ed prior to getting full state reported")

            # buffered write, add more dirty data to the buffer
            print("starting buffered write")
            try:
                for n in range(0, int({fill_mb} * 0.2)):
                    bytes += os.write(f, b'x' * 1024 * 1024)
                print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                time.sleep({full_wait})
            except OSError:
                pass

            print("wrote, now waiting 30s and then doing a close we expect to fail")

            # Wait long enough for a background flush that should fail
            time.sleep(30)

            if {is_fuse}:
                # ...and check that the failed background flush is reflected in fclose
                try:
                    os.close(f)
                except OSError:
                    print("close() returned an error as expected")
                else:
                    raise RuntimeError("close() failed to raise error")
            else:
                # The kernel cephfs client does not raise errors on fclose
                os.close(f)

            os.unlink("{file_path}")
            """)
        self._remote_write_test(remote_script)

    def test_full_fsync(self):
        """
        That when the full flag is encountered during asynchronous
        flushes, an fwrite() succeeds but a subsequent fsync()/fclose()
        returns the ENOSPC error.
        """

        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 4096)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("wrote bytes via buffered write, moving on to fsync")
                except OSError as e:
                    if {is_fuse}:
                        print("Unexpected error %s from write() instead of fsync()" % e)
                        raise
                    else:
                        print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                        full = True
                        break

                try:
                    os.fsync(f)
                    print("fsync'ed successfully")
                except OSError as e:
                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                    full = True
                    break
                else:
                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync.  There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print("calling close")
            os.close(f)
            print("close() did not raise error")

            os.unlink("{file_path}")
            """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
    fill_mb = pool_capacity // (1024 * 1024)  # type: ignore
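    # i.e. a 32MiB quota, so fill_mb works out to 32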

    # We are only testing quota handling on the data pool, not the metadata
    # pool.
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))


class TestClusterFull(FullnessTestCase):
    """
    Test data pool fullness, which indicates that an OSD has become too full
    """
    pool_capacity = None
    REQUIRE_MEMSTORE = True

    def setUp(self):
        super(TestClusterFull, self).setUp()

        if self.pool_capacity is None:
            TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))
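            # The capacity is sampled once from the data pool's 'max_avail' and
            # cached on the class, so every test in this class fills toward the
            # same target.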

# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']