]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/reg11184.py
update sources to v12.1.2
[ceph.git] / ceph / qa / tasks / reg11184.py
1 """
2 Special regression test for tracker #11184
3
4 Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
5
6 This is accomplished by moving a pg that wasn't part of the split and still includes
7 divergent priors.
8 """
9 import logging
10 import time
11 from cStringIO import StringIO
12
13 from teuthology.orchestra import run
14 from teuthology import misc as teuthology
15 from util.rados import rados
16 import os
17
18
19 log = logging.getLogger(__name__)
20
21
def _wait_for_osds_up(manager, count, interval=10):
    """Poll ``manager`` until at least ``count`` OSDs are reported up."""
    while len(manager.get_osd_status()['up']) < count:
        time.sleep(interval)


def task(ctx, config):
    """
    Test handling of divergent entries during export / import
    to regression test tracker #11184

    overrides:
      ceph:
        conf:
          osd:
            debug osd: 5

    Requires 3 osds on a single test node.

    :param ctx: test context; this task reads ``ctx.managers['ceph']`` and
                ``ctx.cluster``.
    :param config: task configuration dict, or None for an empty one.
    :raises AssertionError: if ``config`` is not a dict, or if one of the
                            checked tool / rados invocations fails.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'reg11184 task only accepts a dict for configuration'

    manager = ctx.managers['ceph']

    osds = [0, 1, 2]
    _wait_for_osds_up(manager, len(osds))
    manager.flush_pg_stats(osds)
    # Freeze the in/out/down state so only this test moves OSDs around.
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'
    testdir = teuthology.get_testdir(ctx)

    # create 1 pg pool
    log.info('creating foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
    # The trailing "|| true" makes a failure of this command non-fatal.
    manager.raw_cluster_cmd(
        'osd', 'pool', 'application', 'enable',
        'foo', 'rados', run.Raw('||'), 'true')

    # Remove extra pool to simplify log output
    manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd',
                            '--yes-i-really-really-mean-it')

    # Keep the pg log short on every osd.
    for i in osds:
        manager.set_config(i, osd_min_pg_log_entries=10)
        manager.set_config(i, osd_max_pg_log_entries=10)
        manager.set_config(i, osd_pg_log_trim_min=5)

    # determine primary
    divergent = manager.get_pg_primary('foo', 0)
    log.info("primary and soon to be divergent is %d", divergent)
    non_divergent = list(osds)
    non_divergent.remove(divergent)

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    # write 100 objects
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

    # blackhole non_divergent
    log.info("blackholing osds %s", str(non_divergent))
    for i in non_divergent:
        manager.set_config(i, objectstore_blackhole=1)

    DIVERGENT_WRITE = 5
    DIVERGENT_REMOVE = 5
    # Write some soon to be divergent
    log.info('writing divergent objects')
    for i in range(DIVERGENT_WRITE):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
                         dummyfile2], wait=False)
    # Remove some soon to be divergent
    log.info('remove divergent objects')
    for i in range(DIVERGENT_REMOVE):
        rados(ctx, mon, ['-p', 'foo', 'rm',
                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
    # The rados clients above were started with wait=False; give them time
    # to reach the primary, then kill whatever is still running.
    time.sleep(10)
    mon.run(
        args=['killall', '-9', 'rados'],
        wait=True,
        check_status=False)

    # kill all the osds but leave divergent in
    log.info('killing all the osds')
    for i in osds:
        manager.kill_osd(i)
    for i in osds:
        manager.mark_down_osd(i)
    for i in non_divergent:
        manager.mark_out_osd(i)

    # bring up non-divergent
    log.info("bringing up non_divergent %s", str(non_divergent))
    for i in non_divergent:
        manager.revive_osd(i)
    for i in non_divergent:
        manager.mark_in_osd(i)

    # write 1 non-divergent object (ensure that old divergent one is divergent)
    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
    log.info('writing non-divergent object %s', objname)
    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])

    manager.wait_for_recovery()

    # ensure no recovery of up osds first
    log.info('delay recovery')
    for i in non_divergent:
        manager.wait_run_admin_socket(
            'osd', i, ['set_recovery_delay', '100000'])

    # bring in our divergent friend; 'noup' is set while its recovery delay
    # is configured and cleared again right afterwards
    log.info("revive divergent %d", divergent)
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.revive_osd(divergent)

    log.info('delay recovery divergent')
    manager.wait_run_admin_socket(
        'osd', divergent, ['set_recovery_delay', '100000'])

    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    _wait_for_osds_up(manager, len(osds))

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    # At this point the divergent_priors should have been detected

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)

    # Split pgs for pool foo
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
    time.sleep(5)

    manager.raw_cluster_cmd('pg', 'dump')

    # Export a pg
    (exp_remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
    FSPATH = manager.get_filepath()
    JPATH = os.path.join(FSPATH, "journal")
    # NOTE(review): FSPATH presumably still contains a literal "{id}"
    # placeholder after this first format() -- confirm against
    # manager.get_filepath().  That is why every command built from
    # ``prefix`` is formatted a second time with ``id=<osd>`` below.
    # "$$" is left for the remote shell to expand.
    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
              "--data-path {fpath} --journal-path {jpath} "
              "--log-file="
              "/var/log/ceph/objectstore_tool.$$.log ".
              format(fpath=FSPATH, jpath=JPATH))
    pid = os.getpid()
    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
    cmd = ((prefix + "--op export --pgid 2.0 --file {file}").
           format(id=divergent, file=expfile))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # Remove the same pg that was exported
    cmd = ((prefix + "--op remove --pgid 2.0").
           format(id=divergent))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # Kill one of non-divergent OSDs
    log.info('killing osd.%d', non_divergent[0])
    manager.kill_osd(non_divergent[0])
    manager.mark_down_osd(non_divergent[0])
    # manager.mark_out_osd(non_divergent[0])

    # An empty collection for pg 2.0 might need to be cleaned up.
    # The exit status is deliberately not checked here.
    cmd = ((prefix + "--op remove --pgid 2.0").
           format(id=non_divergent[0]))
    exp_remote.run(args=cmd, wait=True,
                   check_status=False, stdout=StringIO())

    # Import the exported pg into the non-divergent osd that was just
    # stopped (all osds share one node, so the same remote is used).
    cmd = ((prefix + "--op import --file {file}").
           format(id=non_divergent[0], file=expfile))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # bring in our divergent friend and other node
    log.info("revive divergent %d", divergent)
    manager.revive_osd(divergent)
    manager.mark_in_osd(divergent)
    log.info("revive %d", non_divergent[0])
    manager.revive_osd(non_divergent[0])

    _wait_for_osds_up(manager, len(osds))

    log.info('delay recovery divergent')
    manager.set_config(divergent, osd_recovery_delay_start=100000)
    log.info('mark divergent in')
    manager.mark_in_osd(divergent)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)
    log.info("reviving divergent %d", divergent)
    manager.revive_osd(divergent)
    time.sleep(3)

    log.info('allowing recovery')
    # Set osd_recovery_delay_start back to 0 and kick the queue
    for i in osds:
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
                                'kick_recovery_wq', ' 0')

    log.info('reading divergent objects')
    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
                                       '/tmp/existing'])
        # Compare by value: "is 0" only works by accident of CPython's
        # small-integer caching.
        assert exit_status == 0

    # The export file lives on the divergent osd's remote, looked up above.
    cmd = 'rm {file}'.format(file=expfile)
    exp_remote.run(args=cmd, wait=True)
    log.info("success")