]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Lost_unfound | |
3 | """ | |
4 | import logging | |
5 | import time | |
6 | import ceph_manager | |
7 | from teuthology import misc as teuthology | |
8 | from teuthology.orchestra import run | |
9 | from util.rados import rados | |
10 | ||
11 | log = logging.getLogger(__name__) | |
12 | ||
def task(ctx, config):
    """
    Test handling of lost objects.

    A pretty rigid cluster is brought up and tested by this task:
    it needs exactly three OSDs.  osd.2 is taken out first so early
    writes land only on osd.0/osd.1; then osd.0 is stopped so a batch
    of new writes exists only on osd.1; osd.1 is then killed and
    declared lost, producing unfound objects.  Those are reverted with
    ``pg mark_unfound_lost revert`` and the end state is verified.

    :param ctx: teuthology run context (cluster, daemons, archive dir)
    :param config: optional dict; honors 'parallel_bench' (default True)
                   to run a rados bench alongside the revert step
    """
    POOL = 'unfound_pool'
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() unpacks cleanly on both py2 lists and py3 view objects,
    # unlike the py2-only .iterkeys()
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # the scenario below assumes three OSDs are available
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    manager.wait_for_clean()

    manager.create_pool(POOL)

    # something that is always there
    dummyfile = '/etc/fstab'

    # take an osd out until the very end
    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.mark_out_osd(2)

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    # create old objects: existing_* stay, existed_* are deleted again so
    # their history is in the pg log
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
        )

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    # these writes land only on osd.1; losing it makes them unfound
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])

    # bring osd.0 back up, let it peer, but don't replicate the new
    # objects...
    log.info('osd.0 command_args is %s', 'foo')
    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
        '--osd-recovery-delay-start', '1000'
        ])
    manager.revive_osd(0)
    manager.mark_in_osd(0)
    manager.wait_till_osd_is_up(0)

    manager.flush_pg_stats([1, 0])
    manager.wait_till_active()

    # take out osd.1 and the only copy of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.mark_out_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')

    # bring up osd.2 so that things would otherwise, in theory, recovery fully
    manager.revive_osd(2)
    manager.mark_in_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.flush_pg_stats([0, 2])
    manager.wait_till_active()
    manager.flush_pg_stats([0, 2])

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects", unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        # keep client I/O going while we revert, to exercise that path
        procs.append(mon.run(
            args=[
                "/bin/sh", "-c",
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', 'client.admin',
                          '-b', str(4 << 10),
                          '-p', POOL,
                          '-t', '20',
                          'bench', '240', 'write',
                          ]).format(tdir=testdir),
                ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
        ))
    time.sleep(10)

    # mark stuff lost
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            primary = 'osd.%d' % pg['acting'][0]

            # verify that i can list them direct from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_missing(pg['pgid'])
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
            # cross-check: entries with no known locations are the unfound set
            num_unfound = 0
            for o in m['objects']:
                if len(o['locations']) == 0:
                    num_unfound += 1
            assert m['num_unfound'] == num_unfound

            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'revert')
        else:
            log.info("no unfound in %s", pg['pgid'])

    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.flush_pg_stats([0, 2])
    manager.wait_for_recovery()

    # verify result: new_* and existed_* reverted away (reads must fail),
    # pre-existing objects must still be readable
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
        assert not err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.mark_in_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)