]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/osd_failsafe_enospc.py
import ceph 15.2.10
[ceph.git] / ceph / qa / tasks / osd_failsafe_enospc.py
CommitLineData
7c673cae
FG
1"""
2Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
3"""
9f95a23c 4from io import BytesIO
7c673cae 5import logging
9f95a23c 6import six
7c673cae
FG
7import time
8
9from teuthology.orchestra import run
e306af50 10from tasks.util.rados import rados
7c673cae
FG
11from teuthology import misc as teuthology
12
13log = logging.getLogger(__name__)
14
def _count_lines_containing(lines, text):
    """Return how many entries of *lines* contain the substring *text*."""
    # An explicit count is used instead of len(filter(...)): on Python 3
    # filter() returns an iterator, which has no len().
    return sum(1 for line in lines if text in line)


def _capture_cluster_log(mon, manager, duration, injectargs=None):
    """
    Capture the output of ``ceph -w`` on *mon* for *duration* seconds.

    The watcher is started first; then, if *injectargs* is given, it is
    injected into osd.0 so that messages caused by the new setting are
    seen by the watcher.

    :param mon: remote on which ``ceph -w`` is run
    :param manager: ceph manager used to issue the ``tell`` command
    :param duration: seconds to keep the watcher running
    :param injectargs: optional argument string for ``injectargs`` on osd.0
    :returns: captured stdout, split into a list of lines
    """
    proc = mon.run(
        args=[
            'sudo',
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=BytesIO(),
        wait=False,
    )

    if injectargs is not None:
        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', injectargs)

    time.sleep(duration)
    proc.stdin.close()  # causes daemon-helper send SIGKILL to ceph -w
    proc.wait()

    return six.ensure_str(proc.stdout.getvalue()).split('\n')


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-ignorelist as follows

        tasks:
        - chef:
        - install:
        - ceph:
            log-ignorelist: ['OSD near full', 'OSD full dropping all updates']
        - osd_failsafe_enospc:

    :param ctx: teuthology context; ``ctx.managers['ceph']`` and
                ``ctx.cluster`` are used
    :param config: task configuration, a dict or None (treated as empty)
    :raises AssertionError: if config is not a dict, or if any observed
                            message count or write result differs from
                            what the test expects
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # cluster log messages counted by the checks below
    near_full_msg = '[WRN] OSD near full'
    full_msg = '[ERR] OSD full dropping all updates'

    manager = ctx.managers['ceph']

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    lines = _capture_cluster_log(
        mon, manager, sleep_time,
        injectargs='--osd_failsafe_nearfull_ratio .00001')

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    lines = _capture_cluster_log(
        mon, manager, sleep_time,
        injectargs='--osd_failsafe_full_ratio .00001')

    count = _count_lines_containing(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # Nothing is injected here: the nearfull ratio set in step 1 is still
    # in effect.
    lines = _capture_cluster_log(mon, manager, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    lines = _capture_cluster_log(
        mon, manager, sleep_time,
        injectargs='--osd_failsafe_full_ratio .00001')

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
    time.sleep(10)

    lines = _capture_cluster_log(mon, manager, sleep_time)

    count = _count_lines_containing(lines, near_full_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_lines_containing(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])