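"""
Exercise CephFS behaviour when the cluster or a data pool becomes full:
OSD epoch barriers, ENOSPC propagation to clients, and recovery once
space is freed.
"""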
import json
import logging
import os
from textwrap import dedent
from typing import Optional
from teuthology.exceptions import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase


log = logging.getLogger(__name__)


class FullnessTestCase(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    # Subclasses define whether they're filling whole cluster or just data pool
    data_only = False

    # Subclasses define how many bytes should be written to achieve fullness
    pool_capacity: Optional[int] = None
    fill_mb = None

    def is_full(self):
        return self.fs.is_full()

    def setUp(self):
        CephFSTestCase.setUp(self)

        mds_status = self.fs.rank_asok(["status"])

        # Capture the initial OSD map epoch for later use
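        # ('osdmap_epoch_barrier' is the OSD map epoch that the MDS requires
        # clients to have seen before using newly issued capabilities; it serves
        # as the baseline for the barrier assertions in test_barrier below)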
        self.initial_osd_epoch = mds_status['osdmap_epoch_barrier']

    def test_barrier(self):
        """
        That when an OSD epoch barrier is set on an MDS, subsequently
        issued capabilities cause clients to update their OSD map to that
        epoch.
        """

        # Script that syncs the client up with the MDS OSD map barrier.  The
        # barrier should be updated by the cap flush ack message.
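        # (Creating the file and changing its mode dirties capabilities; the
        # fsync forces a cap flush, and the MDS's ack is expected to carry its
        # current epoch barrier back to the client.)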
        pyscript = dedent("""
            import os
            fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0o600)
            os.fchmod(fd, 0o666)
            os.fsync(fd)
            os.close(fd)
            """)

        # Sync up client with initial MDS OSD map barrier.
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))

        # Grab the mount's initial OSD epoch: later we will check that
        # it hasn't advanced beyond this point.
        mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch()

        # Freshly mounted at start of test, should be up to date with OSD map
        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)

        # Set and unset a flag to cause OSD epoch to increment
        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")

        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
        new_epoch = json.loads(out)['epoch']
        self.assertNotEqual(self.initial_osd_epoch, new_epoch)

        # Do a metadata operation on clients, witness that they end up with
        # the old OSD map from startup time (nothing has prompted client
        # to update its map)
        path = os.path.join(self.mount_a.mountpoint, "foo")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
        self.assertEqual(mount_a_barrier, mount_a_initial_barrier)

        # Set a barrier on the MDS
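        # (capabilities issued after this point should carry the barrier,
        # prompting clients to fetch at least this OSD map epoch)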
        self.fs.rank_asok(["osdmap", "barrier", str(new_epoch)])

        # Sync up client with new MDS OSD map barrier
        path = os.path.join(self.mount_a.mountpoint, "baz")
        self.mount_a.run_python(pyscript.format(path=path))
        mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch()
        self.assertEqual(mount_a_barrier, new_epoch)

        # Some time passes here because the metadata part of the operation
        # completes immediately, while the resulting OSD map update happens
        # asynchronously (it's an Objecter::_maybe_request_map) as a result
        # of seeing the new epoch barrier.
        self.wait_until_true(
            lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch,
            timeout=30)

    def _data_pool_name(self):
        data_pool_names = self.fs.get_data_pool_names()
        if len(data_pool_names) > 1:
            raise RuntimeError("This test can't handle multiple data pools")
        else:
            return data_pool_names[0]

    def _test_full(self, easy_case):
        """
        - That a client trying to write data to a file is prevented
          from doing so with an -EFULL result
        - That they are also prevented from creating new files by the MDS.
        - That they may delete another file to get the system healthy again

        :param easy_case: if true, delete a successfully written file to
                          free up space.  else, delete the file that experienced
                          the failed write.
        """

        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))

        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))

        # Fill up the cluster.  This dd may or may not fail, as it depends on
        # how soon the cluster recognises its own fullness
        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
        try:
            self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2)
        except CommandFailedError:
            log.info("Writing file B failed (full status happened already)")
            assert self.is_full()
        else:
            log.info("Writing file B succeeded (full status will happen soon)")
            self.wait_until_true(lambda: self.is_full(),
                                 timeout=osd_mon_report_interval * 120)

        # Attempting to write more data should give me ENOSPC
        with self.assertRaises(CommandFailedError) as ar:
            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the policy of rejecting non-deletion metadata operations
        # while in the full state.
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        if not self.data_only:
            with self.assertRaises(CommandFailedError):
                self.mount_a.write_n_mb("small_file_1", 0)

        # Clear out some space
        if easy_case:
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
        else:
            # In the hard case it is the file that filled the system.
            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
            # would fail because the last objects written would be
            # stuck in the client cache as objecter operations.
            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])

        # Here we are waiting for two things to happen:
        # * The MDS to purge the stray folder and execute object deletions
        # * The OSDs to inform the mon that they are no longer full
        self.wait_until_true(lambda: not self.is_full(),
                             timeout=osd_mon_report_interval * 120)

        # Wait for the MDS to see the latest OSD map so that it will reliably
        # be applying the free space policy
        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
        self.wait_until_true(
            lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch,
            timeout=10)

        # Now I should be able to write again
        self.mount_a.write_n_mb("large_file", 50, seek=0)

        # Ensure that the MDS keeps its OSD epoch barrier across a restart

    def test_full_different_file(self):
        self._test_full(True)

    def test_full_same_file(self):
        self._test_full(False)

    def _remote_write_test(self, template):
        """
        Run some remote python in a way that's useful for
        testing free space behaviour (see test_* methods using this)
        """
        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")

        # Enough to trip the full flag
        osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd'))
        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))

        # Sufficient data to cause RADOS cluster to go 'full'
        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))

        # Long enough for RADOS cluster to notice it is full and set flag on mons
        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
        # factor of 1.5 for I/O + network latency in committing OSD map and distributing it
        # to the OSDs)
        full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5
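        # (e.g. with osd_mon_report_interval=5 and mon_tick_interval=5 this
        # allows (5 + 5) * 1.5 = 15 seconds for the full flag to appear)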

        # Configs for this test should bring this setting down in order to
        # run reasonably quickly
        if osd_mon_report_interval > 10:
            log.warning("This test may run rather slowly unless you decrease "
                        "osd_mon_report_interval (5 is a good setting)!")

        # Set the object_size to 1MB to distribute the objects more evenly
        # among the OSDs (see Tracker #45434)
        file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576"
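        # (with stripe_count=1 and a 1MiB object size, each megabyte written
        # maps to its own RADOS object)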
        self.mount_a.run_python(template.format(
            fill_mb=self.fill_mb,
            file_path=file_path,
            file_layout=file_layout,
            full_wait=full_wait,
            is_fuse=isinstance(self.mount_a, FuseMount)
        ))

    def test_full_fclose(self):
        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 512 * 1024)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 0.9)):
                bytes += os.write(f, b'x' * 1024 * 1024)
                print("wrote {{0}} bytes via buffered write, may repeat".format(bytes))
            print("done writing {{0}} bytes".format(bytes))

            # OK, now we should sneak in under the full condition
            # due to the time it takes the OSDs to report to the
            # mons, and get a successful fsync on our full-making data
            os.fsync(f)
            print("successfully fsync'ed prior to getting full state reported")

            # buffered write, add more dirty data to the buffer
            print("starting buffered write")
            try:
                for n in range(0, int({fill_mb} * 0.2)):
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})
            except OSError:
                pass

            print("wrote, now waiting 30s and then doing a close we expect to fail")

            # Wait long enough for a background flush that should fail
            time.sleep(30)

            if {is_fuse}:
                # ...and check that the failed background flush is reflected in fclose
                try:
                    os.close(f)
                except OSError:
                    print("close() returned an error as expected")
                else:
                    raise RuntimeError("close() failed to raise error")
            else:
                # The kernel cephfs client does not raise errors on fclose
                os.close(f)

            os.unlink("{file_path}")
            """)
        self._remote_write_test(remote_script)

    def test_full_fsync(self):
        """
        That when the full flag is encountered during asynchronous
        flushes, an fwrite() succeeds but the subsequent fsync()/fclose()
        returns the ENOSPC error.
        """

        # A remote script which opens a file handle, fills up the filesystem, and then
        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
        remote_script = dedent("""
            import time
            import datetime
            import subprocess
            import os

            # Write some buffered data through before going full, all should be well
            print("writing some data through which we expect to succeed")
            bytes = 0
            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
            os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}')
            bytes += os.write(f, b'a' * 4096)
            os.fsync(f)
            print("fsync'ed data successfully, will now attempt to fill fs")

            # Okay, now we're going to fill up the filesystem, and then keep
            # writing until we see an error from fsync.  As long as we're doing
            # buffered IO, the error should always only appear from fsync and not
            # from write
            full = False

            for n in range(0, int({fill_mb} * 1.1)):
                try:
                    bytes += os.write(f, b'x' * 1024 * 1024)
                    print("wrote bytes via buffered write, moving on to fsync")
                except OSError as e:
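                    # (an error from write() is unexpected for the FUSE client,
                    # whose writes are buffered; the kernel client may return
                    # ENOSPC from write() itself, which also counts as reaching
                    # fullness)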
                    if {is_fuse}:
                        print("Unexpected error %s from write() instead of fsync()" % e)
                        raise
                    else:
                        print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                        full = True
                        break

                try:
                    os.fsync(f)
                    print("fsync'ed successfully")
                except OSError as e:
                    print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0)))
                    full = True
                    break
                else:
                    print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0)))

                if n > {fill_mb} * 0.9:
                    # Be cautious in the last region where we expect to hit
                    # the full condition, so that we don't overshoot too dramatically
                    print("sleeping a bit as we've exceeded 90% of our expected full ratio")
                    time.sleep({full_wait})

            if not full:
                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)

            # close() should not raise an error because we already caught it in
            # fsync.  There shouldn't have been any more writeback errors
            # since then because all IOs got cancelled on the full flag.
            print("calling close")
            os.close(f)
            print("close() did not raise error")

            os.unlink("{file_path}")
            """)

        self._remote_write_test(remote_script)


class TestQuotaFull(FullnessTestCase):
    """
    Test per-pool fullness, which indicates quota limits exceeded
    """
    pool_capacity = 1024 * 1024 * 32   # arbitrary low-ish limit
    fill_mb = pool_capacity // (1024 * 1024)  # type: ignore
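    # (32MiB of capacity, so fill_mb works out to 32)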

    # We are only testing quota handling on the data pool, not the metadata
    # pool.
    data_only = True

    def setUp(self):
        super(TestQuotaFull, self).setUp()

        pool_name = self.fs.get_data_pool_name()
        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
                                            "max_bytes", "{0}".format(self.pool_capacity))


class TestClusterFull(FullnessTestCase):
    """
    Test data pool fullness, which indicates that an OSD has become too full
    """
    pool_capacity = None
    REQUIRE_MEMSTORE = True
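    # (memstore-backed OSDs keep the cluster capacity small, presumably so that
    # it can actually be filled within the test's runtime)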

    def setUp(self):
        super(TestClusterFull, self).setUp()

        if self.pool_capacity is None:
            TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
            TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024))

# Hide the parent class so that unittest.loader doesn't try to run it.
del globals()['FullnessTestCase']