1 | """ |
2 | osd recovery | |
3 | """ | |
4 | import logging | |
7c673cae | 5 | import time |
e306af50 | 6 | from tasks import ceph_manager |
7c673cae FG |
7 | from teuthology import misc as teuthology |
8 | ||
9 | ||
10 | log = logging.getLogger(__name__) | |
11 | ||
12 | ||
def rados_start(testdir, remote, cmd):
    """
    Run a remote rados command (currently used only to write data)
    """
    log.info("rados %s" % ' '.join(cmd))
    pre = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'rados',
        ]
    pre.extend(cmd)
    proc = remote.run(
        args=pre,
        wait=False,
        )
    return proc


def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

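    # wait for all three osds to come up before proceeding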
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
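    # flush pg stats from osds 0-2 so the reported cluster state is current
    # before waiting for clean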
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()


def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs.  Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

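    # wait for all four osds to come up before proceeding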
    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

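    # push the start of recovery far into the future so the cluster stays
    # unrecovered while we rearrange the osds below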
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

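    # recovery start is still delayed, so the cluster should be active but not
    # yet clean or fully recovered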
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

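    # kick each osd's recovery work queue so recovery can proceed despite the
    # osd_recovery_delay_start configured above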
    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()