]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Lost_unfound | |
3 | """ | |
4 | from teuthology.orchestra import run | |
5 | import logging | |
6 | import ceph_manager | |
7 | from teuthology import misc as teuthology | |
8 | from util.rados import rados | |
9 | import time | |
10 | ||
11 | log = logging.getLogger(__name__) | |
12 | ||
def task(ctx, config):
    """
    Test handling of lost objects on an ec pool.

    A pretty rigid cluster is brought up and tested by this task:
    four OSDs (0-3) hosting a k=2/m=2 erasure-coded pool.  Objects are
    written, shards are destroyed by killing OSDs, and the task verifies
    that the resulting unfound objects can be listed and then deleted
    via ``pg ... mark_unfound_lost delete``.

    :param ctx: teuthology run context (cluster, archive dir, ...)
    :param config: optional dict; recognized keys:
        - erasure_code_profile: dict of EC profile settings
          (default k=2, m=2, ruleset-failure-domain=osd)
        - parallel_bench: bool, run a rados bench alongside recovery
          (default True)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # dict.keys() unpacks the same way on py2 and py3; iterkeys() is py2-only.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.wait_for_clean()

    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '2',
        'ruleset-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'lost_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        erasure_code_profile_name=profile_name,
        min_size=2)

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    for osd_id in (0, 1):
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')
    manager.wait_for_recovery()

    # create old objects
    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
        )

    # take two shards offline so subsequent writes go only to osd.1/osd.2
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.kill_osd(3)
    manager.mark_down_osd(3)

    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])

    # take out osd.1 and a necessary shard of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
    manager.revive_osd(0)
    manager.wait_till_osd_is_up(0)
    manager.revive_osd(3)
    manager.wait_till_osd_is_up(3)

    for osd_id in (0, 2, 3):
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')
    manager.wait_till_active()
    for osd_id in (0, 2, 3):
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        # run a background rados bench to exercise the pool while the
        # unfound objects are being dealt with
        procs.append(mon.run(
            args=[
                "/bin/sh", "-c",
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', 'client.admin',
                          '-b', str(4<<10),
                          '-p', pool,
                          '-t', '20',
                          'bench', '240', 'write',
                      ]).format(tdir=testdir),
            ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
        ))
        time.sleep(10)

    # mark stuff lost
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            # verify that i can list them direct from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_missing(pg['pgid'])
            log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

            log.info("reverting unfound in %s", pg['pgid'])
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'delete')
        else:
            log.info("no unfound in %s", pg['pgid'])

    # kick recovery along and wait for the cluster to settle
    for osd_id in (0, 2, 3):
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id,
                                'debug', 'kick_recovery_wq', '5')
    for osd_id in (0, 2, 3):
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')
    manager.wait_for_recovery()

    if not config.get('parallel_bench', True):
        time.sleep(20)

    # verify result: all objects were deleted by mark_unfound_lost,
    # so every read must fail
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
        assert err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)