]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Lost_unfound | |
3 | """ | |
4 | import logging | |
7c673cae | 5 | import time |
e306af50 TL |
6 | |
7 | from tasks import ceph_manager | |
8 | from tasks.util.rados import rados | |
7c673cae | 9 | from teuthology import misc as teuthology |
e306af50 | 10 | from teuthology.orchestra import run |
7c673cae FG |
11 | |
12 | log = logging.getLogger(__name__) | |
13 | ||
def task(ctx, config):
    """
    Test handling of lost objects, resolved via 'mark_unfound_lost delete'.

    A pretty rigid 3-OSD cluster is brought up and tested by this task:
    divergent writes are arranged so that one OSD holds the only copy of
    some objects, that OSD is then marked lost, the resulting unfound
    objects are deleted, and the cluster is checked to recover clean
    (every deleted object must fail to read back).

    :param ctx: teuthology run context (cluster, daemons, remotes)
    :param config: optional dict; 'parallel_bench' (default True) runs a
        rados bench in parallel while unfound objects are being marked.
    """
    POOL = 'unfounddel_pool'
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for all three OSDs to come up before doing anything
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    manager.create_pool(POOL)

    # something that is always there
    dummyfile = '/etc/fstab'

    # take an osd out until the very end
    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.mark_out_osd(2)

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    # create old objects (present on both remaining OSDs)
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
            'tell', 'osd.1',
            'injectargs',
            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
            )

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    # these writes land only on osd.1, so osd.1 will hold the sole copy
    for f in range(1, 10):
        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])

    # bring osd.0 back up, let it peer, but don't replicate the new
    # objects...
    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
        '--osd-recovery-delay-start', '1000'
    ])
    manager.revive_osd(0)
    manager.mark_in_osd(0)
    manager.wait_till_osd_is_up(0)

    manager.flush_pg_stats([0, 1])
    manager.wait_till_active()

    # take out osd.1 and the only copy of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.mark_out_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')

    # bring up osd.2 so that things would otherwise, in theory, recovery fully
    manager.revive_osd(2)
    manager.mark_in_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.flush_pg_stats([0, 2])
    manager.wait_till_active()
    manager.flush_pg_stats([0, 2])

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects", unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        # keep client traffic flowing while unfound objects are handled
        procs.append(mon.run(
            args=[
                "/bin/sh", "-c",
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', 'client.admin',
                          '-b', str(4<<10),
                          '-p', POOL,
                          '-t', '20',
                          'bench', '240', 'write',
                      ]).format(tdir=testdir),
            ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
        ))
        time.sleep(10)

    # mark stuff lost
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            primary = 'osd.%d' % pg['acting'][0]

            # verify that i can list them direct from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_unfound(pg['pgid'])
            #log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
            # cross-check: every listed object with no locations is unfound
            num_unfound = 0
            for o in m['objects']:
                if len(o['locations']) == 0:
                    num_unfound += 1
            assert m['num_unfound'] == num_unfound

            log.info("deleting unfound in %s on %s", pg['pgid'], primary)
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'delete')
        else:
            log.info("no unfound in %s", pg['pgid'])

    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.flush_pg_stats([0, 2])
    manager.wait_for_recovery()

    # verify result: the unfound objects were deleted, so every read fails
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
        assert err

    # see if osd.1 can cope
    manager.mark_in_osd(1)
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)
    manager.wait_for_clean()