1 | """ |
2 | Dump_stuck command | |
3 | """ | |
4 | import logging | |
7c673cae FG |
5 | import time |
6 | ||
e306af50 | 7 | from tasks import ceph_manager |
7c673cae FG |
8 | from teuthology import misc as teuthology |
9 | ||
10 | ||
11 | log = logging.getLogger(__name__) | |

def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks. Make sure get_stuck_pgs returns the right amount of information, then
    extract health information from the raw_cluster_cmd and compare the results with
    the values passed in. This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: number of inactive PGs that are stuck
    :param num_unclean: number of unclean PGs that are stuck
    :param num_stale: number of stale PGs that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %d / %d, unclean %d / %d, stale %d / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale

def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'
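    # Exactly two OSDs are needed: the test walks the cluster through each
    # stuck state by taking those two OSDs down one at a time.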

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
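    # CephManager (from tasks/ceph_manager.py) drives ceph commands against
    # the cluster via the mon remote; the get_stuck_pgs and raw_cluster_cmd
    # calls below go through it.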

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')
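    # Assumption: mon_pg_stuck_threshold is in seconds; lowering it to 10
    # makes the mon flag stuck PGs quickly, so the check_stuck() polls below
    # do not have to wait out the (much longer) default.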

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()
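    # Baseline PG count while healthy; the unclean/stale checks below expect
    # every one of these PGs to be reported stuck.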

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)
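    # osd.0 is out but still up: PGs remap to osd.1 and, once recovery
    # completes, return to active+clean (possibly remapped), so nothing
    # should be stuck.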

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)
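    # With osd.0 down (one of only two OSDs), PGs stay active but can no
    # longer be active+clean, so every PG should eventually show up as
    # stuck unclean.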

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)
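    # Fully recovered: no PG should be stuck in any of the three categories.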

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )