# ceph/qa/tasks/cephfs/test_full.py

import json
import logging
import os
from textwrap import dedent
import time
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling whole cluster or just data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity = None
    fill_mb = None

    # Subclasses define what fullness means to them
    def is_full(self):
        raise NotImplementedError()

    def setUp(self):
        CephFSTestCase.setUp(self)

        mds_status = self.fs.rank_asok(["status"])

        # Capture the initial OSD map epoch for later use
        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # Sync up clients with initial MDS OSD map barrier
        self.mount_a.open_no_data("foo")
        self.mount_b.open_no_data("bar")

        # Grab mounts' initial OSD epochs: later we will check that
        # they haven't advanced beyond this point.
        mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
        mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
        self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)

        # Do a metadata operation on clients, witness that they end up with
        # the old OSD map from startup time (nothing has prompted client
        # to update its map)
        self.mount_a.open_no_data("alpha")
        self.mount_b.open_no_data("bravo1")

        # Sleep long enough that if the OSD map was propagating it would
        # have done so (this is arbitrary because we are 'waiting' for something
        # to *not* happen).
        time.sleep(30)

        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch()
        self.assertEqual(mount_b_epoch, mount_b_initial_epoch)

        # Set a barrier on the MDS
        self.fs.rank_asok(["osdmap", "barrier", str(new_epoch)])

        # Do an operation on client B, witness that it ends up with
        # the latest OSD map from the barrier. This shouldn't generate any
        # cap revokes to A because B was already the last one to touch
        # a file in root.
        self.mount_b.run_shell(["touch", "bravo2"])
        self.mount_b.open_no_data("bravo2")

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
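        # (The final lambda acts as a reject condition: if either the epoch or
        # the barrier overshoots new_epoch, something advanced the client's map
        # unexpectedly, so fail fast rather than waiting out the timeout.)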
        self.wait_until_equal(
            lambda: self.mount_b.get_osd_epoch(),
            (new_epoch, new_epoch),
            30,
            lambda x: x[0] > new_epoch or x[1] > new_epoch)

        # ...and none of this should have affected the oblivious mount a,
        # because it wasn't doing any data or metadata IO
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")
        else:
            return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space. else, delete the file that experienced
                          the failed write.
        """

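        # How quickly the OSDs report their stats to the mons bounds how long
        # the cluster may take to notice it is full, so use it to size the
        # waits below.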
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster. This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2)
        try:
            self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval * 5)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

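        # When the whole cluster is full the MDS also rejects non-deletion
        # metadata operations, so creating even an empty file must fail.  The
        # quota case (data_only = True) only exercises the data pool, so it
        # skips this check.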
        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval * 5)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

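    # Easy case: free up space by deleting a file other than the one whose
    # write failed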
    def test_full_different_file(self):
        self._test_full(True)

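    # Hard case: delete the very file whose last writes may still be stuck in
    # the client cache when the cluster fills up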
    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Intervals that determine how quickly the cluster notices and reports
        # that it is full
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        # factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        # to the OSDs)
        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval > 10:
            log.warn("This test may run rather slowly unless you decrease "
                     "osd_mon_report_interval (5 is a good setting)!")

        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes are surfaced when the file is closed
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print "writing some data through which we expect to succeed"
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            bytes += os.write(f, 'a' * 512 * 1024)
            os.fsync(f)
            print "fsync'ed data successfully, will now attempt to fill fs"

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync. As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

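            # Write only ~90% of the expected capacity here, so that the fsync
            # below can still succeed before the full flag is reported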
            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, 'x' * 1024 * 1024)
                print "wrote {{0}} bytes via buffered write, may repeat".format(bytes)
            print "done writing {{0}} bytes".format(bytes)

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print "successfully fsync'ed prior to getting full state reported"

            # buffered write, add more dirty data to the buffer
            print "starting buffered write"
            try:
                for n in range(0, int({fill_mb} * 0.2)):
                    bytes += os.write(f, 'x' * 1024 * 1024)
                    print "sleeping a bit as we've exceeded 90% of our expected full ratio"
                    time.sleep({full_wait})
            except OSError:
                pass

273 print "wrote, now waiting 30s and then doing a close we expect to fail"
274
275 # Wait long enough for a background flush that should fail
276 time.sleep(30)
277
278 if {is_fuse}:
279 # ...and check that the failed background flush is reflected in fclose
280 try:
281 os.close(f)
282 except OSError:
283 print "close() returned an error as expected"
284 else:
285 raise RuntimeError("close() failed to raise error")
286 else:
287 # The kernel cephfs client does not raise errors on fclose
288 os.close(f)
289
290 os.unlink("{file_path}")
291 """)
292 self._remote_write_test(remote_script)
293
294 def test_full_fsync(self):
295 """
296 That when the full flag is encountered during asynchronous
297 flushes, such that an fwrite() succeeds but an fsync/fclose()
298 should return the ENOSPC error.
299 """
300
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print "writing some data through which we expect to succeed"
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            bytes += os.write(f, 'a' * 4096)
            os.fsync(f)
            print "fsync'ed data successfully, will now attempt to fill fs"

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync. As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

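            # Write ~110% of the expected capacity so that we are certain to
            # hit the full condition even if the estimate was slightly low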
            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, 'x' * 1024 * 1024)
                    print "wrote bytes via buffered write, moving on to fsync"
                except OSError as e:
                    print "Unexpected error %s from write() instead of fsync()" % e
                    raise

                try:
                    os.fsync(f)
                    print "fsync'ed successfully"
                except OSError as e:
                    print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
                    full = True
                    break
                else:
                    print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print "sleeping a bit as we've exceeded 90% of our expected full ratio"
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync. There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print "calling close"
            os.close(f)
            print "close() did not raise error"

            os.unlink("{file_path}")
        """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32  # arbitrary low-ish limit
    fill_mb = pool_capacity / (1024 * 1024)

    # We are only testing quota handling on the data pool, not the metadata
    # pool.
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))

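    # Both subclasses report fullness via self.fs.is_full(); for this case that
    # state is reached once the data pool hits the quota set above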
    def is_full(self):
        return self.fs.is_full()


class TestClusterFull(FullnessTestCase):
    """
    Test cluster-wide fullness, which indicates that an OSD has become too full
388 """
389 pool_capacity = None
390 REQUIRE_MEMSTORE = True
391
392 def setUp(self):
393 super(TestClusterFull, self).setUp()
394
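        # Derive the fill target from the data pool's free space and the mon
        # full ratio, and cache it on the class so that every test in this
        # case works against the same figure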
        if self.pool_capacity is None:
            max_avail = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            full_ratio = float(self.fs.get_config("mon_osd_full_ratio", service_type="mon"))
            TestClusterFull.pool_capacity = int(max_avail * full_ratio)
            TestClusterFull.fill_mb = (self.pool_capacity / (1024 * 1024))

    def is_full(self):
        return self.fs.is_full()

# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']