]> git.proxmox.com Git - ceph.git/blame - ceph/qa/tasks/reg11184.py
buildsys: fix parallel builds
[ceph.git] / ceph / qa / tasks / reg11184.py
CommitLineData
7c673cae
FG
"""
Special regression test for tracker #11184

Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))

This is accomplished by moving a pg that wasn't part of a split and still
includes divergent priors.
"""
9import logging
10import time
11from cStringIO import StringIO
12
13from teuthology import misc as teuthology
14from util.rados import rados
15import os
16
17
18log = logging.getLogger(__name__)
19
20
def task(ctx, config):
    """
    Test handling of divergent entries during export / import
    to regression test tracker #11184

    overrides:
      ceph:
        conf:
          osd:
            debug osd: 5

    Requires 3 osds on a single test node.

    The task proceeds in these stages:

    1. Wait for 3 up osds, set noout/noin/nodown and create the
       single-pg pool 'foo' with a short pg log.
    2. Write 100 objects, blackhole the two non-primary osds, then issue
       writes and removes that only the primary (the "divergent" osd)
       can record.
    3. Restart the cluster without the divergent osd, write one more
       object, then bring the divergent osd back with recovery delayed.
    4. Kill the divergent osd, split the pool to 2 pgs, and use
       ceph-objectstore-tool to export, remove and re-import pg 1.0.
    5. Revive everything, kick recovery and verify that the first
       DIVERGENT_WRITE + DIVERGENT_REMOVE objects can be read.

    :param ctx: teuthology context; ``ctx.managers['ceph']`` and
                ``ctx.cluster`` are used.
    :param config: task configuration dict (``None`` is treated as ``{}``).
    :raises AssertionError: if ``config`` is not a dict, if any
                ceph-objectstore-tool invocation exits non-zero, or if
                reading back one of the objects fails.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'reg11184 task only accepts a dict for configuration'

    manager = ctx.managers['ceph']

    # Poll until all three osds are reported up.
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'
    testdir = teuthology.get_testdir(ctx)

    # create 1 pg pool
    log.info('creating foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')

    # Keep the pg log short on every osd.
    osds = [0, 1, 2]
    for i in osds:
        manager.set_config(i, osd_min_pg_log_entries=10)
        manager.set_config(i, osd_max_pg_log_entries=10)
        manager.set_config(i, osd_pg_log_trim_min=5)

    # determine primary
    divergent = manager.get_pg_primary('foo', 0)
    log.info("primary and soon to be divergent is %d", divergent)
    non_divergent = list(osds)
    non_divergent.remove(divergent)

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    # write 100 objects
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

    # blackhole non_divergent
    log.info("blackholing osds %s", str(non_divergent))
    for i in non_divergent:
        manager.set_config(i, objectstore_blackhole=1)

    DIVERGENT_WRITE = 5
    DIVERGENT_REMOVE = 5
    # Write some soon to be divergent
    # (wait=False: the commands are started without waiting for them; they
    # are killed below.)
    log.info('writing divergent objects')
    for i in range(DIVERGENT_WRITE):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
                         dummyfile2], wait=False)
    # Remove some soon to be divergent
    log.info('remove divergent objects')
    for i in range(DIVERGENT_REMOVE):
        rados(ctx, mon, ['-p', 'foo', 'rm',
                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
    time.sleep(10)
    # Kill the outstanding rados clients; check_status=False so a non-zero
    # exit from killall does not fail the task.
    mon.run(
        args=['killall', '-9', 'rados'],
        wait=True,
        check_status=False)

    # kill all the osds but leave divergent in
    log.info('killing all the osds')
    for i in osds:
        manager.kill_osd(i)
    for i in osds:
        manager.mark_down_osd(i)
    for i in non_divergent:
        manager.mark_out_osd(i)

    # bring up non-divergent
    log.info("bringing up non_divergent %s", str(non_divergent))
    for i in non_divergent:
        manager.revive_osd(i)
    for i in non_divergent:
        manager.mark_in_osd(i)

    # write 1 non-divergent object (ensure that old divergent one is divergent)
    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
    log.info('writing non-divergent object %s', objname)
    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])

    manager.wait_for_recovery()

    # ensure no recovery of up osds first
    log.info('delay recovery')
    for i in non_divergent:
        manager.wait_run_admin_socket(
            'osd', i, ['set_recovery_delay', '100000'])

    # bring in our divergent friend
    # 'noup' is set while the osd is started and its recovery delay is
    # configured; it is unset again right after.
    log.info("revive divergent %d", divergent)
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.revive_osd(divergent)

    log.info('delay recovery divergent')
    manager.wait_run_admin_socket(
        'osd', divergent, ['set_recovery_delay', '100000'])

    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    # At this point the divergent_priors should have been detected

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)

    # Split pgs for pool foo
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
    time.sleep(5)

    # Export a pg
    (exp_remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
    FSPATH = manager.get_filepath()
    JPATH = os.path.join(FSPATH, "journal")
    # NOTE(review): the id= argument passed to the .format() calls below is
    # presumably consumed by an '{id}' placeholder inside the path returned
    # by manager.get_filepath() -- confirm before removing it.
    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
              "--data-path {fpath} --journal-path {jpath} "
              "--log-file="
              "/var/log/ceph/objectstore_tool.$$.log ".
              format(fpath=FSPATH, jpath=JPATH))
    pid = os.getpid()
    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
    # NOTE(review): pgid 1.0 assumes pool 'foo' has pool id 1 -- confirm.
    cmd = ((prefix + "--op export --pgid 1.0 --file {file}").
           format(id=divergent, file=expfile))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # Remove the same pg that was exported
    cmd = ((prefix + "--op remove --pgid 1.0").
           format(id=divergent, file=expfile))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # Kill one of non-divergent OSDs
    # (it is marked down but not out; the mark_out_osd call is disabled)
    log.info('killing osd.%d', non_divergent[1])
    manager.kill_osd(non_divergent[1])
    manager.mark_down_osd(non_divergent[1])

    # Import the exported pg using the id of the non-divergent osd that was
    # just killed; the command runs on exp_remote (the task requires all
    # osds on a single node).
    cmd = ((prefix + "--op import --file {file}").
           format(id=non_divergent[1], file=expfile))
    proc = exp_remote.run(args=cmd, wait=True,
                          check_status=False, stdout=StringIO())
    assert proc.exitstatus == 0

    # bring in our divergent friend and other node
    log.info("revive divergent %d", divergent)
    manager.revive_osd(divergent)
    manager.mark_in_osd(divergent)
    log.info("revive %d", non_divergent[1])
    manager.revive_osd(non_divergent[1])

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    log.info('delay recovery divergent')
    manager.set_config(divergent, osd_recovery_delay_start=100000)
    log.info('mark divergent in')
    manager.mark_in_osd(divergent)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)
    log.info("reviving divergent %d", divergent)
    manager.revive_osd(divergent)
    time.sleep(3)

    log.info('allowing recovery')
    # Set osd_recovery_delay_start back to 0 and kick the queue
    # NOTE(review): the last argument is ' 0' with a leading space; it is
    # kept unchanged here -- confirm whether the space is intentional.
    for i in osds:
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
                                'kick_recovery_wq', ' 0')

    log.info('reading divergent objects')
    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
                                       '/tmp/existing'])
        # Compare by value: identity comparison against an int literal only
        # works by accident of small-int caching.
        assert exit_status == 0

    # Clean up the export file on the divergent osd's node.
    (remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
    cmd = 'rm {file}'.format(file=expfile)
    remote.run(args=cmd, wait=True)
    log.info("success")