"""
Special regression test for tracker #11184

Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))

This is accomplished by moving a pg that wasn't part of a split and still
includes divergent priors.
"""

import logging
import os
import time

from teuthology.exceptions import CommandFailedError
from teuthology.orchestra import run
from teuthology import misc as teuthology
from tasks.util.rados import rados

log = logging.getLogger(__name__)


def task(ctx, config):
    """
    Test handling of divergent entries during export / import
    to regression test tracker #11184.

    Requires 3 osds on a single test node.
    """
    assert isinstance(config, dict), \
        'divergent_priors task only accepts a dict for configuration'

    manager = ctx.managers['ceph']

    osds = [0, 1, 2]  # the three OSDs the test requires (ids assumed)
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)  # poll until all three OSDs are up
    manager.flush_pg_stats(osds)
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.wait_for_clean()

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'
    testdir = teuthology.get_testdir(ctx)

    log.info('creating foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
    manager.raw_cluster_cmd(
        'osd', 'pool', 'application', 'enable',
        'foo', 'rados', run.Raw('||'), 'true')

    # Remove extra pool to simplify log output
    manager.raw_cluster_cmd(
        'osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it')

    for i in osds:
        manager.set_config(i, osd_min_pg_log_entries=10)
        manager.set_config(i, osd_max_pg_log_entries=10)
        manager.set_config(i, osd_pg_log_trim_min=5)

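    # Note: the short pg log configured above presumably keeps the divergent
    # entries from staying in the log, so they are recorded as divergent
    # priors instead (interpretation, not stated explicitly here).
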
    divergent = manager.get_pg_primary('foo', 0)
    log.info("primary and soon to be divergent is %d", divergent)
    non_divergent = list(osds)
    non_divergent.remove(divergent)

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    # seed the pool with a batch of objects (loop header reconstructed;
    # object count assumed)
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

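    # With objectstore_blackhole set, the replica OSDs silently drop incoming
    # transactions, so the writes and removes below are only durably recorded
    # by the primary ("divergent") OSD.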
    # blackhole non_divergent
    log.info("blackholing osds %s", str(non_divergent))
    for i in non_divergent:
        manager.set_config(i, objectstore_blackhole=1)

    # number of divergent writes/removes to issue (values assumed for this
    # reconstruction)
    DIVERGENT_WRITE = 5
    DIVERGENT_REMOVE = 5

    # Write some soon to be divergent
    log.info('writing divergent objects')
    for i in range(DIVERGENT_WRITE):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
                         dummyfile2], wait=False)
    # Remove some soon to be divergent
    log.info('remove divergent objects')
    for i in range(DIVERGENT_REMOVE):
        rados(ctx, mon, ['-p', 'foo', 'rm',
                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)

    # the rados operations above were started with wait=False; kill any
    # lingering client processes (call reconstructed around the original
    # args list)
    mon.run(
        args=['killall', '-9', 'rados'],
        wait=True,
        check_status=False)

    # kill all the osds but leave divergent in
    log.info('killing all the osds')
    for i in osds:
        manager.kill_osd(i)
    for i in osds:
        manager.mark_down_osd(i)
    for i in non_divergent:
        manager.mark_out_osd(i)

    # bring up non-divergent
    log.info("bringing up non_divergent %s", str(non_divergent))
    for i in non_divergent:
        manager.revive_osd(i)
    for i in non_divergent:
        manager.mark_in_osd(i)

    # write 1 non-divergent object (ensure that old divergent one is divergent)
    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
    log.info('writing non-divergent object ' + objname)
    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])

    manager.wait_for_recovery()

    # ensure no recovery of up osds first
    log.info('delay recovery')
    for i in non_divergent:
        manager.wait_run_admin_socket(
            'osd', i, ['set_recovery_delay', '100000'])

    # bring in our divergent friend
    log.info("revive divergent %d", divergent)
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.revive_osd(divergent)

    log.info('delay recovery divergent')
    manager.wait_run_admin_socket(
        'osd', divergent, ['set_recovery_delay', '100000'])

    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)  # poll until all three OSDs are up again

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    # At this point the divergent_priors should have been detected

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)

    # Split pgs for pool foo
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')

    manager.raw_cluster_cmd('pg', 'dump')

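    # Export pg 2.0 from the divergent OSD (which was down during the pg_num
    # split above, so its copy of the pg was not split) and later import it
    # into one of the other OSDs; moving a pg that still carries divergent
    # priors is the scenario behind tracker #11184.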
    (exp_remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
    FSPATH = manager.get_filepath()
    JPATH = os.path.join(FSPATH, "journal")
    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
              "--data-path {fpath} --journal-path {jpath} "
              "--log-file="  # flag assumed; the log path below is original
              "/var/log/ceph/objectstore_tool.$$.log ".
              format(fpath=FSPATH, jpath=JPATH))
    pid = os.getpid()  # definition assumed for this reconstruction
    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
           format(id=divergent, file=expfile))
    try:
        exp_remote.sh(cmd, wait=True)
    except CommandFailedError as e:
        assert e.exitstatus == 0

    # Kill one of the non-divergent OSDs
    log.info('killing osd.%d' % non_divergent[0])
    manager.kill_osd(non_divergent[0])
    manager.mark_down_osd(non_divergent[0])
    # manager.mark_out_osd(non_divergent[0])

    # An empty collection for pg 2.0 might need to be cleaned up
    cmd = ((prefix + "--force --op remove --pgid 2.0").
           format(id=non_divergent[0]))
    exp_remote.sh(cmd, wait=True, check_status=False)

    cmd = ((prefix + "--op import --file {file}").
           format(id=non_divergent[0], file=expfile))
    try:
        exp_remote.sh(cmd, wait=True)
    except CommandFailedError as e:
        assert e.exitstatus == 0

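    # pg 2.0, together with its divergent priors, now lives on the first
    # non-divergent OSD; the export-remove above already deleted the
    # divergent OSD's own copy.
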
    # bring in our divergent friend and other node
    log.info("revive divergent %d", divergent)
    manager.revive_osd(divergent)
    manager.mark_in_osd(divergent)
    log.info("revive %d", non_divergent[0])
    manager.revive_osd(non_divergent[0])

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)  # poll until all three OSDs report up

    log.info('delay recovery divergent')
    manager.set_config(divergent, osd_recovery_delay_start=100000)
    log.info('mark divergent in')
    manager.mark_in_osd(divergent)

    log.info('wait for peering')
    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])

    log.info("killing divergent %d", divergent)
    manager.kill_osd(divergent)
    log.info("reviving divergent %d", divergent)
    manager.revive_osd(divergent)

    log.info('allowing recovery')
    # Set osd_recovery_delay_start back to 0 and kick the queue
    for i in osds:
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
                                'kick_recovery_wq', ' 0')

    log.info('reading divergent objects')
    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
                                       '/tmp/existing'])  # dest path assumed
        assert exit_status == 0

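    # If the SnapMapper assert from tracker #11184 regressed, it would be
    # expected to trip during the recovery or the reads above.
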
    # clean up the export file on the divergent OSD's node
    (remote,) = ctx.\
        cluster.only('osd.{o}'.format(o=divergent)).remotes.keys()
    cmd = 'rm {file}'.format(file=expfile)
    remote.run(args=cmd, wait=True)