]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | """ |
2 | Dump_stuck command | |
3 | """ | |
4 | import logging | |
5 | import re | |
6 | import time | |
7 | ||
8 | import ceph_manager | |
9 | from teuthology import misc as teuthology | |
10 | ||
11 | ||
12 | log = logging.getLogger(__name__) | |
13 | ||
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Check the number of stuck pgs reported for each stuck state.

    Calls manager.get_stuck_pgs once each for the 'inactive', 'unclean'
    and 'stale' states, logs the observed and expected counts, and fails
    on the first state (checked in that order) whose count differs from
    the expected value.

    :param manager: Ceph manager; must provide get_stuck_pgs(state, timeout)
    :param num_inactive: expected number of stuck inactive pgs
    :param num_unclean: expected number of stuck unclean pgs
    :param num_stale: expected number of stuck stale pgs
    :param timeout: timeout value for get_stuck_pgs calls
    :raises AssertionError: if an observed count does not match
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    # Raise explicitly instead of using ``assert`` so the checks are not
    # stripped under ``python -O`` and the failure names the offending
    # state.  AssertionError is kept because callers retry on it.
    for state, stuck, expected in (
            ('inactive', inactive, num_inactive),
            ('unclean', unclean, num_unclean),
            ('stale', stale, num_stale)):
        if len(stuck) != expected:
            raise AssertionError(
                'expected %d stuck %s pgs, found %d'
                % (expected, state, len(stuck)))
36 | ||
def _wait_for_stuck(manager, num_inactive, num_unclean, num_stale,
                    max_wait=900, interval=1):
    """
    Poll check_stuck until the expected stuck pg counts are seen.

    Retries check_stuck whenever it raises AssertionError.  Once more than
    max_wait seconds have passed since the first attempt, the failing
    AssertionError is re-raised instead of retrying.

    :param manager: Ceph manager passed through to check_stuck
    :param num_inactive: expected number of stuck inactive pgs
    :param num_unclean: expected number of stuck unclean pgs
    :param num_stale: expected number of stuck stale pgs
    :param max_wait: seconds to keep retrying (default 15 minutes)
    :param interval: seconds to sleep between attempts
    """
    starttime = time.time()
    while True:
        try:
            check_stuck(
                manager,
                num_inactive=num_inactive,
                num_unclean=num_unclean,
                num_stale=num_stale,
                )
            return
        except AssertionError:
            if time.time() - starttime > max_wait:
                raise
        # Pause between polls rather than re-querying in a tight loop.
        time.sleep(interval)


def task(ctx, config):
    """
    Test the dump_stuck command.

    Requires no configuration and exactly two osds.  The test lowers
    mon-pg-stuck-threshold, then checks the stuck pg counts after each of
    these steps: initial clean state, osd 0 marked out, osd 0 marked back
    in, osd 0 killed (all pgs unclean), osd 1 killed (all pgs unclean and
    stale), and finally all osds revived.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() works on both Python 2 and 3 (iterkeys() is Python 2 only);
    # the tuple unpacking still requires exactly one matching remote.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # Baseline: nothing is stuck.
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    # With osd 0 marked out, every pg is expected to be reported unclean.
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    # Marking osd 0 back in should return the cluster to clean.
    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    # wait up to 15 minutes to become unclean
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    # wait up to 15 minutes to become stale
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=num_pgs,
        )

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # Keep retrying the flush once per second until it stops raising.
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )