]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Dump_stuck command | |
3 | """ | |
4 | import logging | |
5 | import re | |
6 | import time | |
7 | ||
8 | import ceph_manager | |
9 | from teuthology import misc as teuthology | |
10 | ||
11 | ||
12 | log = logging.getLogger(__name__) | |
13 | ||
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Verify the cluster reports the expected numbers of stuck pgs.

    Queries get_stuck_pgs for each stuck state ('inactive', 'unclean',
    'stale'), logs the observed vs. expected counts, and asserts that
    each observed count matches the expected value.  Passes iff all
    three asserts pass.

    :param manager: Ceph manager used to query pg state
    :param num_inactive: expected number of stuck inactive pgs
    :param num_unclean: expected number of stuck unclean pgs
    :param num_stale: expected number of stuck stale pgs
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale
36 | ||
def task(ctx, config):
    """
    Test the dump_stuck command.

    Exercises the mon's stuck-pg reporting against a 2-osd cluster:
    checks that a healthy cluster reports no stuck pgs, then marks an
    osd out (still none stuck), then kills each osd in turn and waits
    for every pg to be reported stuck unclean and finally stuck stale,
    reviving all osds and re-checking a clean state at the end.

    :param ctx: Context
    :param config: Configuration (must be None)
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    # exactly two osds: killing one should leave every pg unclean,
    # killing both should leave every pg stale
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # NOTE(review): iterkeys() is Python 2 only
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # start from a fully clean cluster
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # lower the stuck threshold so pgs are reported as stuck after
    # only 10 seconds, keeping the test's wait loops short
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    # marking an osd out triggers recovery but should not leave any
    # pg stuck
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise


    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # flush_pg_stats fails until the revived osds are actually up;
    # retry until it succeeds
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    # back to all active+clean: nothing stuck
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )