1 | """ |
2 | Lost_unfound | |
3 | """ | |
4 | import logging | |
5 | import time | |
6 | import ceph_manager | |
7 | from teuthology import misc as teuthology | |
8 | from teuthology.orchestra import run | |
9 | from util.rados import rados | |
10 | ||
11 | log = logging.getLogger(__name__) | |
12 | ||
def task(ctx, config):
    """
    Test handling of lost objects.

    A pretty rigid cluster is brought up and tested by this task
    """
    POOL = 'unfound_pool'
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

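    # CephManager drives the cluster through the ceph CLI on the mon
    # remote; all of the osd/pg manipulation below goes through it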
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

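    # wait for all three osds to come up, then for every pg to go
    # active+clean, so the test starts from a known-good state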
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    manager.wait_for_clean()

    manager.create_pool(POOL)

    # something that is always there
    dummyfile = '/etc/fstab'

    # take an osd out until the very end
    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.mark_out_osd(2)

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery()

    # create old objects
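    # existing_* objects survive the whole test; existed_* are created
    # and then deleted, so only their pg log entries remain; both
    # predate the osd.0 outage below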
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
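    # log-based recovery tracks exactly which objects each osd is
    # missing, which is what lets them show up as unfound; backfill
    # would instead copy the whole pg and hide that state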
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
        )

    manager.kill_osd(0)
    manager.mark_down_osd(0)

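    # with osd.0 down, these writes land only on osd.1, making it the
    # sole holder of the new object versions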
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])

    # bring osd.0 back up, let it peer, but don't replicate the new
    # objects...
    log.info('osd.0 command_args is %s',
             ctx.daemons.get_daemon('osd', 0).command_args)
    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
        '--osd-recovery-delay-start', '1000'
        ])
    manager.revive_osd(0)
    manager.mark_in_osd(0)
    manager.wait_till_osd_is_up(0)

    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.wait_till_active()

    # take out osd.1 and the only copy of those objects.
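    # 'osd lost' declares osd.1's data permanently gone, so the cluster
    # stops waiting for it and the newer versions become unfound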
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.mark_out_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')

    # bring up osd.2 so that things would otherwise, in theory, recover fully
    manager.revive_osd(2)
    manager.mark_in_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_till_active()
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')

    # verify that there are unfound objects
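    # osd.2's copy predates every write above and osd.0 missed the new
    # ones, so the pg logs name versions no surviving osd can supply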
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
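    # optionally run a rados bench in the background so the pool sees
    # live client io while objects are unfound and while they are being
    # reverted below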
    if config.get('parallel_bench', True):
        procs.append(mon.run(
            args=[
                "/bin/sh", "-c",
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', 'client.admin',
                          '-b', str(4<<10),
                          '-p', POOL,
                          '-t', '20',
                          'bench', '240', 'write',
                          ]).format(tdir=testdir),
                ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
            ))
    time.sleep(10)

    # mark stuff lost
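    # 'mark_unfound_lost revert' rolls each unfound object back to the
    # newest version that still exists somewhere, or deletes it if no
    # prior version ever did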
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            primary = 'osd.%d' % pg['acting'][0]

            # verify that we can list them directly from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_missing(pg['pgid'])
            #log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
            num_unfound = 0
            for o in m['objects']:
                if len(o['locations']) == 0:
                    num_unfound += 1
            assert m['num_unfound'] == num_unfound

            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'revert')
        else:
            log.info("no unfound in %s", pg['pgid'])

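    # kick_recovery_wq drops osd-recovery-delay-start (to 5 here) and
    # wakes the recovery queue, clearing the delays injected earlier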
    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # verify result
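    # new_* never existed before the outage and existed_* had been
    # deleted, so revert removes both and their reads must fail;
    # existing_* revert to their old contents and must read back fine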
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
        assert not err

    # see if osd.1 can cope
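    # osd.1 rejoins holding the now-divergent newer versions; it has to
    # roll them back and settle into the reverted cluster state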
    manager.revive_osd(1)
    manager.mark_in_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)