1 | """ |
2 | Lost_unfound | |
3 | """ | |
7c673cae | 4 | import logging |
7c673cae | 5 | import time |
e306af50 TL |
6 | from tasks import ceph_manager |
7 | from tasks.util.rados import rados | |
8 | from teuthology import misc as teuthology | |
9 | from teuthology.orchestra import run | |
7c673cae FG |
10 | |
11 | log = logging.getLogger(__name__) | |
12 | ||
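# Typical teuthology usage, sketched; the task name below assumes this module
# is registered under its file name, and the only config keys the task reads
# are 'erasure_code_profile' (with an optional 'name') and 'parallel_bench':
#
#     tasks:
#     - ceph:
#     - ec_lost_unfound:
#         parallel_bench: false
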
def task(ctx, config):
    """
    Test handling of lost objects on an EC pool.

    A pretty rigid cluster is brought up and tested by this task.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.wait_for_clean()

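    # k=2 data shards + m=2 coding shards = 4 shards per object, so the
    # pool can tolerate the loss of any two shards without losing data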
    profile = config.get('erasure_code_profile', {
        'k': '2',
        'm': '2',
        'crush-failure-domain': 'osd'
    })
    profile_name = profile.get('name', 'lost_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        erasure_code_profile_name=profile_name,
        min_size=2)

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    # create old objects: the 'existing_*' objects remain in the pool, while
    # each 'existed_*' object is written and immediately removed so that its
    # deletion is recorded in the pg log
    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
    )

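    # take two of the four shards offline; with k=2 and min_size=2 the pool
    # stays writeable, so the writes below land only on osd.1 and osd.2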
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.kill_osd(3)
    manager.mark_down_osd(3)

    for f in range(1, 10):
        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])

    # take out osd.1 and a necessary shard of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
    manager.revive_osd(0)
    manager.wait_till_osd_is_up(0)
    manager.revive_osd(3)
    manager.wait_till_osd_is_up(3)

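    # osd.1 held one of the only two shards of the writes made above, so
    # once it is lost those objects have a single shard left (< k) and
    # should be reported unfound after the revived OSDs check in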
    manager.flush_pg_stats([0, 2, 3])
    manager.wait_till_active()
    manager.flush_pg_stats([0, 2, 3])

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

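    # optionally run a rados bench in the background so the pool keeps
    # seeing client I/O while the unfound objects are handled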
    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        procs.append(mon.run(
            args=[
                "/bin/sh", "-c",
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', 'client.admin',
                          '-b', str(4 << 10),
                          '-p', pool,
                          '-t', '20',
                          'bench', '240', 'write',
                          ]).format(tdir=testdir),
            ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
        ))
        time.sleep(10)

    # mark stuff lost: delete (rather than revert) the unfound objects,
    # since revert is not supported for erasure-coded pools
    pgs = manager.get_pg_stats()
    for pg in pgs:
        if pg['stat_sum']['num_objects_unfound'] > 0:
            # verify that we can list them directly from the osd
            log.info('listing missing/lost in %s state %s', pg['pgid'],
                     pg['state'])
            m = manager.list_pg_unfound(pg['pgid'])
            log.info('%s' % m)
            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']

            log.info("marking unfound lost (delete) in %s", pg['pgid'])
            manager.raw_cluster_cmd('pg', pg['pgid'],
                                    'mark_unfound_lost', 'delete')
        else:
            log.info("no unfound in %s", pg['pgid'])

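    # kick the recovery work queues: recovery was delayed earlier via
    # --osd-recovery-delay-start, so prod the OSDs to start it now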
    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
    manager.flush_pg_stats([0, 2, 3])
    manager.wait_for_recovery()

    if not config.get('parallel_bench', True):
        time.sleep(20)

    # verify result: all three classes of objects were rewritten while only
    # osd.1 and osd.2 were up, so their latest versions became unfound and
    # were deleted above; every read is expected to fail
    for f in range(1, 10):
        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
        assert err
        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
        assert err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)
    manager.wait_for_clean()