1 """
2 Special regression test for tracker #11184
3
4 Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
5
6 This is accomplished by moving a pg that wasn't part of split and still include
7 divergent priors.
8 """
import logging
import os
import time

from teuthology.exceptions import CommandFailedError
from teuthology.orchestra import run
from teuthology import misc as teuthology
from tasks.util.rados import rados


log = logging.getLogger(__name__)


def task(ctx, config):
    """
    Test handling of divergent entries during export / import,
    as a regression test for tracker #11184

    overrides:
      ceph:
        conf:
          osd:
            debug osd: 5

    Requires 3 osds on a single test node.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'divergent_priors task only accepts a dict for configuration'

    manager = ctx.managers['ceph']

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    osds = [0, 1, 2]
    manager.flush_pg_stats(osds)
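    # The test repeatedly kills and revives osds by hand, so stop the
    # cluster from automatically marking osds out/in/down underneath us.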
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.wait_for_clean()

    # files that are always present on the test node
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'
    testdir = teuthology.get_testdir(ctx)

    # create a pool with a single pg
    log.info('creating foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
    manager.raw_cluster_cmd(
        'osd', 'pool', 'application', 'enable',
        'foo', 'rados', run.Raw('||'), 'true')

    # Remove the extra pool to simplify log output
    manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd',
                            '--yes-i-really-really-mean-it')

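    # Keep the pg logs very short, presumably so that the divergent
    # entries written below are trimmed out of the log and end up as
    # divergent priors rather than plain divergent log entries.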
    for i in osds:
        manager.set_config(i, osd_min_pg_log_entries=10)
        manager.set_config(i, osd_max_pg_log_entries=10)
        manager.set_config(i, osd_pg_log_trim_min=5)

    # determine primary
    divergent = manager.get_pg_primary('foo', 0)
    log.info("primary and soon to be divergent is %d", divergent)
    non_divergent = list(osds)
    non_divergent.remove(divergent)

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    # write 100 objects
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

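    # With objectstore_blackhole set, an osd silently drops objectstore
    # transactions, so the writes and removes issued next never become
    # durable on the replicas and only show up in the primary's pg log.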
    # blackhole non_divergent
    log.info("blackholing osds %s", str(non_divergent))
    for i in non_divergent:
        manager.set_config(i, objectstore_blackhole=1)

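    # These ops can never complete while the replicas are blackholed, so
    # they are issued with wait=False and the rados clients are killed
    # off after a short delay; the ops remain recorded only on the
    # primary, which is what will make its log divergent later on.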
    DIVERGENT_WRITE = 5
    DIVERGENT_REMOVE = 5
    # Write some objects that will soon be divergent
    log.info('writing divergent objects')
    for i in range(DIVERGENT_WRITE):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
                         dummyfile2], wait=False)
    # Remove some objects that will soon be divergent
    log.info('remove divergent objects')
    for i in range(DIVERGENT_REMOVE):
        rados(ctx, mon, ['-p', 'foo', 'rm',
                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
    time.sleep(10)
    mon.run(
        args=['killall', '-9', 'rados'],
        wait=True,
        check_status=False)

    # kill all the osds but leave divergent in
    log.info('killing all the osds')
    for i in osds:
        manager.kill_osd(i)
    for i in osds:
        manager.mark_down_osd(i)
    for i in non_divergent:
        manager.mark_out_osd(i)

    # bring up non-divergent
    log.info("bringing up non_divergent %s", str(non_divergent))
    for i in non_divergent:
        manager.revive_osd(i)
    for i in non_divergent:
        manager.mark_in_osd(i)

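    # With the old primary down, the surviving osds now form the
    # authoritative history; the extra write below makes sure that
    # history has moved past the ops recorded only on the old primary,
    # so those ops stay divergent when it eventually comes back.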
    # write 1 non-divergent object (ensure that the old divergent ops stay divergent)
    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
    log.info('writing non-divergent object ' + objname)
    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])

    manager.wait_for_recovery()

    # ensure no recovery of up osds first
    log.info('delay recovery')
    for i in non_divergent:
        manager.wait_run_admin_socket(
            'osd', i, ['set_recovery_delay', '100000'])

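    # Revive the divergent osd behind the noup flag so its recovery can
    # also be delayed via the admin socket before it is allowed up; once
    # noup is cleared it peers, and the divergent entries / priors on it
    # should be detected at that point.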
    # bring in our divergent friend
    log.info("revive divergent %d", divergent)
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.revive_osd(divergent)

    log.info('delay recovery divergent')
    manager.wait_run_admin_socket(
        'osd', divergent, ['set_recovery_delay', '100000'])

    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    # At this point the divergent_priors should have been detected

156 log.info("killing divergent %d", divergent)
157 manager.kill_osd(divergent)
158
159 # Split pgs for pool foo
160 manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
161 time.sleep(5)
162
163 manager.raw_cluster_cmd('pg','dump')
164
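    # The divergent osd was down for the pg_num increase, so its copy of
    # pg 2.0 (pool foo's original pg here) predates the split and still
    # carries the divergent priors.  Export-remove that copy with
    # ceph-objectstore-tool so it can be moved onto another osd, which is
    # the scenario described in the module docstring.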
    # Export a pg
    (exp_remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
    FSPATH = manager.get_filepath()
    JPATH = os.path.join(FSPATH, "journal")
    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
              "--data-path {fpath} --journal-path {jpath} "
              "--log-file="
              "/var/log/ceph/objectstore_tool.$$.log ".
              format(fpath=FSPATH, jpath=JPATH))
    pid = os.getpid()
    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
           format(id=divergent, file=expfile))
    try:
        exp_remote.sh(cmd, wait=True)
    except CommandFailedError as e:
        assert e.exitstatus == 0

    # Kill one of the non-divergent OSDs
    log.info('killing osd.%d' % non_divergent[0])
    manager.kill_osd(non_divergent[0])
    manager.mark_down_osd(non_divergent[0])
    # manager.mark_out_osd(non_divergent[0])

    # An empty collection for pg 2.0 might need to be cleaned up
    cmd = ((prefix + "--force --op remove --pgid 2.0").
           format(id=non_divergent[0]))
    exp_remote.sh(cmd, wait=True, check_status=False)

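    # Import the exported pg, divergent priors and all, into the
    # surviving osd; the {id} placeholder in prefix comes from
    # manager.get_filepath(), so the data/journal paths now point at
    # this osd's store.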
    cmd = ((prefix + "--op import --file {file}").
           format(id=non_divergent[0], file=expfile))
    try:
        exp_remote.sh(cmd, wait=True)
    except CommandFailedError as e:
        assert e.exitstatus == 0

    # bring in our divergent friend and the other node
    log.info("revive divergent %d", divergent)
    manager.revive_osd(divergent)
    manager.mark_in_osd(divergent)
    log.info("revive %d", non_divergent[0])
    manager.revive_osd(non_divergent[0])

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    log.info('delay recovery divergent')
    manager.set_config(divergent, osd_recovery_delay_start=100000)
    log.info('mark divergent in')
    manager.mark_in_osd(divergent)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)
    log.info("reviving divergent %d", divergent)
    manager.revive_osd(divergent)
    time.sleep(3)

    log.info('allowing recovery')
    # Set osd_recovery_delay_start back to 0 and kick the queue
    for i in osds:
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
                                'kick_recovery_wq', ' 0')

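    # The divergent puts and removes above should be rolled back in
    # favor of the authoritative history, so every object, including the
    # ones the divergent osd removed, is expected to still be readable.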
    log.info('reading divergent objects')
    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
                                       '/tmp/existing'])
        assert exit_status == 0

    (remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
    cmd = 'rm {file}'.format(file=expfile)
    remote.run(args=cmd, wait=True)
    log.info("success")