]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/osd_failsafe_enospc.py
4b2cdb983fc0badd033b9b98c03d400997133044
[ceph.git] / ceph / qa / tasks / osd_failsafe_enospc.py
1 """
2 Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
3 """
4 from io import BytesIO
5 import logging
6 import six
7 import time
8
9 from teuthology.orchestra import run
10 from tasks.util.rados import rados
11 from teuthology import misc as teuthology
12
13 log = logging.getLogger(__name__)
14
def _start_cluster_log_watch(mon):
    """
    Start ``ceph -w`` in the background on the given monitor remote.

    The command is wrapped in ``daemon-helper kill`` so that closing its
    stdin later terminates it; stdout is collected in an in-memory buffer.

    :param mon: remote on which the command is run
    :returns: the still-running process object returned by ``mon.run``
    """
    return mon.run(
        args=[
            'sudo',
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=BytesIO(),
        wait=False,
    )


def _stop_cluster_log_watch(proc, delay):
    """
    Let a ``ceph -w`` watcher run for ``delay`` seconds, then stop it.

    :param proc: process object returned by ``_start_cluster_log_watch``
    :param delay: number of seconds to sleep before stopping the watcher
    :returns: the captured stdout, split into a list of text lines
    """
    time.sleep(delay)
    proc.stdin.close()  # causes daemon-helper send SIGKILL to ceph -w
    proc.wait()
    return six.ensure_str(proc.stdout.getvalue()).split('\n')


def _count_lines_containing(lines, text):
    """
    Return how many entries of ``lines`` contain the substring ``text``.

    A generator expression is used instead of ``len(filter(...))`` because
    ``filter`` returns an iterator on Python 3, which has no ``len()``.
    """
    return sum(1 for line in lines if text in line)


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-whitelist as follows

        tasks:
        - chef:
        - install:
        - ceph:
            log-whitelist: ['OSD near full', 'OSD full dropping all updates']
        - osd_failsafe_enospc:

    The test confines a single-replica pool to osd.0, then lowers and
    restores the two failsafe ratios on that OSD via ``injectargs`` while
    watching ``ceph -w`` output for the expected number of warning and
    error messages. It also checks that a rados write fails while the
    full ratio is exceeded and succeeds once it is restored.

    :param ctx: teuthology run context; ``ctx.managers['ceph']`` and
                ``ctx.cluster`` are used
    :param config: task configuration, a dict or None
    :raises AssertionError: if ``config`` is not a dict or any of the
                            checks fails
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) +
    # 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # cluster log messages counted by the checks below
    near_full_msg = '[WRN] OSD near full'
    full_msg = '[ERR] OSD full dropping all updates'

    manager = ctx.managers['ceph']

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    proc = _start_cluster_log_watch(mon)

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')

    lines = _stop_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    proc = _start_cluster_log_watch(mon)

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')

    lines = _stop_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # nearfull_ratio is still lowered from step 1, so nothing is injected here
    proc = _start_cluster_log_watch(mon)

    lines = _stop_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    proc = _start_cluster_log_watch(mon)

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')

    lines = _stop_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    proc = _start_cluster_log_watch(mon)

    lines = _stop_cluster_log_watch(proc, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])