# ceph/qa/tasks/cephfs/test_full.py
import json
import logging
import os
from textwrap import dedent
from typing import Optional
from teuthology.exceptions import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling the whole cluster or just the data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity: Optional[int] = None
    fill_mb = None

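    # Whether the cluster currently reports itself as full; a thin wrapper around
    # the Filesystem helper's is_full().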
    def is_full(self):
        return self.fs.is_full()

    def setUp(self):
        CephFSTestCase.setUp(self)

        mds_status = self.fs.rank_asok(["status"])

        # Capture the initial OSD map epoch for later use
        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # Script that syncs the client up with the MDS OSD map barrier. The barrier
        # should be updated by the cap flush ack message.
        pyscript = dedent("""
            import os
            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0o600)
            os.fchmod(fd, 0o666)
            os.fsync(fd)
            os.close(fd)
            """)

        # Sync up client with initial MDS OSD map barrier.
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))

        # Grab the mount's initial OSD epoch: later we will check that
        # it hasn't advanced beyond this point.
        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)

        # Do a metadata operation on the client and witness that it ends up with
        # the old OSD map from startup time (nothing has prompted the client
        # to update its map)
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)

        # Set a barrier on the MDS
        self.fs.rank_asok(["osdmap", "barrier", new_epoch.__str__()])

        # Sync up client with new MDS OSD map barrier
        path = os.path.join(self.mount_a.mountpoint, "baz")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_barrier, new_epoch)

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
        self.wait_until_true(
            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
            timeout=30)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")
        else:
            return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space.  else, delete the file that experienced
                          the failed write.
        """

        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster.  This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
        try:
            self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval * 120)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval * 120)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

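    # Easy case: recover by deleting a file that was written successfully before
    # the cluster filled up.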
    def test_full_different_file(self):
        self._test_full(True)

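    # Hard case: recover by deleting the file whose write hit the failure.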
    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Enough to trip the full flag
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        #  to the OSDs)
        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval > 10:
            log.warning("This test may run rather slowly unless you decrease "
                        "osd_mon_report_interval (5 is a good setting)!")

        # Set the object_size to 1MB so that objects are distributed more evenly
        # among the OSDs (see Tracker #45434)
        file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576"
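        # The template may use the placeholders {fill_mb}, {file_path}, {file_layout},
        # {full_wait} and {is_fuse}; they are filled in here before running remotely.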
        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            file_layout=file_layout,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 512 * 1024)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, b'x' * 1024 * 1024)
                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
            print("done writing {{0}} bytes".format(bytes))

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print("successfully fsync'ed prior to getting full state reported")

            # buffered write, add more dirty data to the buffer
            print("starting buffered write")
            try:
                for n in range(0, int({fill_mb} * 0.2)):
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})
            except OSError:
                pass

            print("wrote, now waiting 30s and then doing a close we expect to fail")

            # Wait long enough for a background flush that should fail
            time.sleep(30)

            if {is_fuse}:
                # ...and check that the failed background flush is reflected in fclose
                try:
                    os.close(f)
                except OSError:
                    print("close() returned an error as expected")
                else:
                    raise RuntimeError("close() failed to raise error")
            else:
                # The kernel cephfs client does not raise errors on fclose
                os.close(f)

            os.unlink("{file_path}")
            """)
        self._remote_write_test(remote_script)

    def test_full_fsync(self):
        """
        That when the full flag is encountered during asynchronous
        flushes, an fwrite() succeeds but the subsequent fsync()/fclose()
        returns the ENOSPC error.
        """

        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 4096)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("wrote bytes via buffered write, moving on to fsync")
                except OSError as e:
                    if {is_fuse}:
                        print("Unexpected error %s from write() instead of fsync()" % e)
                        raise
                    else:
                        print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                        full = True
                        break

                try:
                    os.fsync(f)
                    print("fsync'ed successfully")
                except OSError as e:
                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                    full = True
                    break
                else:
                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync.  There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print("calling close")
            os.close(f)
            print("close() did not raise error")

            os.unlink("{file_path}")
            """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
    fill_mb = pool_capacity // (1024 * 1024)  # type: ignore

    # We are only testing quota handling on the data pool, not the metadata
    # pool.
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
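        # Impose a quota on the data pool so that writes beyond pool_capacity
        # bytes cause the pool to report itself as full.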
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))


class TestClusterFull(FullnessTestCase):
    """
    Test data pool fullness, which indicates that an OSD has become too full
    """
    pool_capacity = None
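    # Assumption: memstore-backed OSDs are required so that the cluster is small
    # enough to actually fill within the test's runtime.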
    REQUIRE_MEMSTORE = True

    def setUp(self):
        super(TestClusterFull, self).setUp()

        if self.pool_capacity is None:
            TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))

# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']