import json
import logging
import os
import time
from textwrap import dedent

from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling whole cluster or just data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity = None
    fill_mb = None

    # Subclasses define what fullness means to them
    def is_full(self):
        raise NotImplementedError()

    def setUp(self):
        CephFSTestCase.setUp(self)

        # These tests just use a single active MDS throughout, so remember its ID
        # for use in mds_asok calls
        self.active_mds_id = self.fs.get_active_names()[0]

        # Capture the initial OSD map epoch for later use
        self.initial_osd_epoch = json.loads(
            self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        )['epoch']

        # Check the initial barrier epoch on the MDS: this should be
        # set to the latest map at MDS startup.  We do this check in
        # setUp to get in there before subclasses might touch things
        # in their own setUp functions.
        self.assertGreaterEqual(self.fs.mds_asok(["status"], mds_id=self.active_mds_id)['osdmap_epoch_barrier'],
                                self.initial_osd_epoch)

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # Sync up clients with initial MDS OSD map barrier
        self.mount_a.open_no_data("foo")
        self.mount_b.open_no_data("bar")

        # Grab mounts' initial OSD epochs: later we will check that
        # they haven't advanced beyond this point.
        mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
        mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]
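        # (get_osd_epoch() is taken to return an (epoch, barrier) pair, as it is
        # unpacked further down in this test; only the client's current map epoch,
        # element [0], matters at this point.)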

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
        self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)
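        # (The "osd dump --format=json" output is a JSON object whose top-level
        # "epoch" field is the current OSD map epoch; that is the only field these
        # tests rely on.)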

        # Do a metadata operation on clients, witness that they end up with
        # the old OSD map from startup time (nothing has prompted client
        # to update its map)
        self.mount_a.open_no_data("alpha")
        self.mount_b.open_no_data("bravo1")

        # Sleep long enough that if the OSD map was propagating it would
        # have done so (this is arbitrary because we are 'waiting' for something
        # that should *not* happen)
        time.sleep(30)

        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch()
        self.assertEqual(mount_b_epoch, mount_b_initial_epoch)

        # Set a barrier on the MDS
        self.fs.mds_asok(["osdmap", "barrier", str(new_epoch)], mds_id=self.active_mds_id)
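        # (mds_asok issues the command over the MDS admin socket; done by hand this
        # would look something like `ceph daemon mds.<id> osdmap barrier <epoch>`,
        # assuming the usual admin socket plumbing.)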

        # Do an operation on client B, witness that it ends up with
        # the latest OSD map from the barrier.  This shouldn't generate any
        # cap revokes to A because B was already the last one to touch
        # the files involved.
        self.mount_b.run_shell(["touch", "bravo2"])
        self.mount_b.open_no_data("bravo2")

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
        self.wait_until_equal(
            lambda: self.mount_b.get_osd_epoch(),
            (new_epoch, new_epoch),
            30,
            lambda x: x[0] > new_epoch or x[1] > new_epoch)

        # ...and none of this should have affected the oblivious mount a,
        # because it wasn't doing any data or metadata IO
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")

        return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space.  else, delete the file that experienced
                          the failed write.
        """

        osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster.  This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2)
        try:
            self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval_max * 5)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval_max * 5)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

    def test_full_different_file(self):
        self._test_full(True)

    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Enough to trip the full flag
        osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        #  to the OSDs)
        full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5
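        # For example, with the 5s osd_mon_report_interval_max suggested below and a
        # mon_tick_interval of 5s (an assumed, typical value), full_wait works out to
        # (5 + 5) * 1.5 = 15 seconds of slack for the full flag to propagate.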

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval_max > 10:
            log.warn("This test may run rather slowly unless you decrease "
                     "osd_mon_report_interval_max (5 is a good setting)!")

        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))
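
    # Note on the remote scripts below: they are rendered through str.format() by
    # _remote_write_test, so placeholders like {fill_mb}, {full_wait}, {file_path}
    # and {is_fuse} are substituted here, while braces that must survive into the
    # remote script are doubled (e.g. "{{0}}".format(bytes) in the template becomes
    # "{0}".format(bytes) on the remote node).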

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import os

            # Write some buffered data through before going full, all should be well
            print "writing some data through which we expect to succeed"
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            bytes += os.write(f, 'a' * 512 * 1024)
            os.fsync(f)
            print "fsync'ed data successfully, will now attempt to fill fs"

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, 'x' * 1024 * 1024)
                print "wrote {{0}} bytes via buffered write, may repeat".format(bytes)
            print "done writing {{0}} bytes".format(bytes)

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print "successfully fsync'ed prior to getting full state reported"

            # buffered write, add more dirty data to the buffer
            print "starting buffered write"
            for n in range(0, int({fill_mb} * 0.2)):
                bytes += os.write(f, 'x' * 1024 * 1024)
                print "sleeping a bit as we've exceeded 90% of our expected full ratio"
                time.sleep({full_wait})

            print "wrote, now waiting 30s and then doing a close we expect to fail"

            # Wait long enough for a background flush that should fail
            time.sleep(30)

            if {is_fuse}:
                # ...and check that the failed background flush is reflected in fclose
                try:
                    os.close(f)
                except OSError:
                    print "close() returned an error as expected"
                else:
                    raise RuntimeError("close() failed to raise error")
            else:
                # The kernel cephfs client does not raise errors on fclose
                os.close(f)

            os.unlink("{file_path}")
            """)
        self._remote_write_test(remote_script)

    def test_full_fsync(self):
        """
        That when the full flag is encountered during asynchronous
        flushes, an fwrite() succeeds but the subsequent fsync()/fclose()
        returns the ENOSPC error.
        """

        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import os

            # Write some buffered data through before going full, all should be well
            print "writing some data through which we expect to succeed"
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            bytes += os.write(f, 'a' * 4096)
            os.fsync(f)
            print "fsync'ed data successfully, will now attempt to fill fs"

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False
            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, 'x' * 1024 * 1024)
                    print "wrote bytes via buffered write, moving on to fsync"
                except OSError as e:
                    print "Unexpected error %s from write() instead of fsync()" % e
                    raise

                try:
                    os.fsync(f)
                    print "fsync'ed successfully"
                except OSError:
                    print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
                    full = True
                    break
                else:
                    print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print "sleeping a bit as we've exceeded 90% of our expected full ratio"
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync.  There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print "calling close"
            os.close(f)
            print "close() did not raise error"

            os.unlink("{file_path}")
            """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
    fill_mb = pool_capacity / (1024 * 1024)
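    # i.e. with the 32MiB quota above, fill_mb = (1024 * 1024 * 32) / (1024 * 1024) = 32,
    # so writing 32MB is what is expected to exhaust the pool quota.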

    # We are only testing quota handling on the data pool, not the metadata
    # pool
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))

    def is_full(self):
        return self.fs.is_full()


class TestClusterFull(FullnessTestCase):
    """
    Test data pool fullness, which indicates that an OSD has become too full
    """
    pool_capacity = None
    REQUIRE_MEMSTORE = True

    def setUp(self):
        super(TestClusterFull, self).setUp()

        if self.pool_capacity is None:
            max_avail = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            full_ratio = float(self.fs.get_config("mon_osd_full_ratio", service_type="mon"))
            TestClusterFull.pool_capacity = int(max_avail * full_ratio)
            TestClusterFull.fill_mb = (self.pool_capacity / (1024 * 1024))
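            # (Purely illustrative numbers: if max_avail came back as ~100MB and the
            # mon's full ratio were 0.95, pool_capacity would be about 95MB and
            # fill_mb about 95.)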

    def is_full(self):
        return self.fs.is_full()


# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']