1 | """ |
2 | Resolve stuck peering | |
3 | """ | |
4 | import logging | |
5 | import time | |
6 | ||
7 | from teuthology import misc as teuthology | |
e306af50 | 8 | from tasks.util.rados import rados |
7c673cae FG |
9 | |
10 | log = logging.getLogger(__name__) | |
11 | ||
12 | def task(ctx, config): | |
13 | """ | |
14 | Test handling resolve stuck peering | |
15 | ||
16 | requires 3 osds on a single test node | |
17 | """ | |
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'Resolve stuck peering only accepts a dict for config'

    manager = ctx.managers['ceph']

    # wait for all three OSDs to come up
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    manager.wait_for_clean()

    dummyfile = '/etc/fstab'
    dummyfile1 = '/etc/resolv.conf'

    # create a pool with a single PG
    pool = 'foo'
    log.info('creating pool foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', pool, '1')
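
    # with a single PG, every object maps to the same placement group
    # and the same acting set of OSDs, so the whole pool shares one
    # peering history and the test can reason about one pgid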

    # set min_size of the pool to 1 so that we can continue with I/O
    # when 2 OSDs are down
    manager.set_pool_property(pool, "min_size", 1)

    osds = [0, 1, 2]

    primary = manager.get_pg_primary('foo', 0)
    log.info("primary osd is %d", primary)

    others = list(osds)
    others.remove(primary)
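
    # split the OSDs into the PG's primary and the two replicas: the
    # replicas will be stopped first so that subsequent writes exist
    # only on the primary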

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    # create a few objects
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()
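
    # the existing_* objects are now replicated across all three OSDs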

    # kill all OSDs except the primary
    log.info('killing other osds except primary')
    for i in others:
        manager.kill_osd(i)
    for i in others:
        manager.mark_down_osd(i)
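
    # with min_size 1 the PG stays active on the primary alone, so the
    # new_* objects written below are stored only on the primary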

    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])

    # kill the primary OSD
    manager.kill_osd(primary)
    manager.mark_down_osd(primary)

    # revive the other 2 OSDs
    for i in others:
        manager.revive_osd(i)
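
    # the revived replicas never saw the new_* writes; their PG logs
    # end before the interval in which the primary served I/O alone, so
    # peering cannot complete without the (now dead) primary and the PG
    # should be reported down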

    # make sure that the PG is down
    # assuming the pg number for a single-PG pool starts from 0
    pgnum = 0
    pgstr = manager.get_pgid(pool, pgnum)
    stats = manager.get_single_pg_stats(pgstr)
    print(stats['state'])

    timeout = 60
    start = time.time()

    # poll until the PG reports down instead of spinning on the mon
    while 'down' not in stats['state']:
        assert time.time() - start < timeout, \
            'failed to reach down state before timeout expired'
        time.sleep(1)
        stats = manager.get_single_pg_stats(pgstr)

    # mark the primary as lost so peering can proceed without it
    manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,
                            '--yes-i-really-mean-it')
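
    # 'osd lost' tells the cluster the primary's data is unrecoverable;
    # the new_* objects written only to the primary are given up, which
    # unblocks peering on the surviving replicas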

    # expect the PG status to be active+undersized+degraded;
    # the PG should recover and become active+clean within the timeout
    stats = manager.get_single_pg_stats(pgstr)
    print(stats['state'])

    timeout = 10
    start = time.time()

    # wait for all PGs to leave the down state
    while manager.get_num_down():
        assert time.time() - start < timeout, \
            'failed to recover before timeout expired'
        time.sleep(1)
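
    # bring the lost primary back so it can rejoin the cluster and
    # backfill from the surviving replicas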
    manager.revive_osd(primary)