]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/dump_stuck.py
237d9127fc1487589dd1d114f4cd3cc2af8ce5ee
[ceph.git] / ceph / qa / tasks / dump_stuck.py
1 """
2 Dump_stuck command
3 """
4 import logging
5 import time
6
7 import ceph_manager
8 from teuthology import misc as teuthology
9
10
log = logging.getLogger(__name__)


def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Verify that the number of stuck PGs in each state matches expectations.

    Calls ``manager.get_stuck_pgs`` once for each of the 'inactive',
    'unclean' and 'stale' states (in that order), logs the observed and
    expected counts, and fails if any observed count differs from the
    expected one.

    :param manager: Ceph manager; must provide get_stuck_pgs(state, timeout)
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param timeout: value passed as second argument to each get_stuck_pgs call
    :raises AssertionError: if any observed count differs from the expected
                            one; the message lists every mismatching state
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)

    # Raise explicitly instead of using bare ``assert`` statements so the
    # check is not stripped under ``python -O`` and so a failure reports
    # which state(s) were off. Callers that retry on AssertionError keep
    # working unchanged.
    mismatches = [
        '%s: got %d, expected %d' % (state, len(pgs), expected)
        for state, pgs, expected in (
            ('inactive', inactive, num_inactive),
            ('unclean', unclean, num_unclean),
            ('stale', stale, num_stale),
        )
        if len(pgs) != expected
    ]
    if mismatches:
        raise AssertionError(
            'unexpected stuck PG counts: ' + '; '.join(mismatches))
35
def _wait_for_stuck(manager, num_inactive, num_unclean, num_stale,
                    max_wait=900, interval=1):
    """
    Poll check_stuck until the expected stuck-PG counts are observed.

    :param manager: Ceph manager passed through to check_stuck
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param max_wait: seconds after which a failing check is re-raised
    :param interval: seconds to sleep between two failing checks
    :raises AssertionError: if the counts still do not match once more than
                            max_wait seconds have elapsed
    """
    starttime = time.time()
    while True:
        try:
            check_stuck(
                manager,
                num_inactive=num_inactive,
                num_unclean=num_unclean,
                num_stale=num_stale,
            )
            return
        except AssertionError:
            # Give up only once the overall deadline has passed.
            if time.time() - starttime > max_wait:
                raise
        # Pause briefly so failing checks are not re-issued back to back.
        time.sleep(interval)


def task(ctx, config):
    """
    Test the dump_stuck command.

    Scenario, on a cluster with exactly two OSDs:

    1. wait for clean and verify nothing is reported stuck;
    2. mark osd 0 out, wait for recovery, verify nothing is stuck;
    3. mark osd 0 back in, wait for clean, verify nothing is stuck;
    4. kill osd 0 and wait (up to 15 minutes) until all PGs are
       reported stuck unclean;
    5. kill osd 1 and wait (up to 15 minutes) until all PGs are
       reported stuck unclean and stuck stale;
    6. revive both OSDs, wait for clean, verify nothing is stuck.

    :param ctx: Context
    :param config: Configuration; must be None
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # NOTE(review): presumably shortens the time before a PG counts as
    # stuck so the checks below do not have to wait long -- confirm against
    # the monitor option documentation.
    manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )
    # Remember the total PG count; it is the expected number of stuck PGs
    # once OSDs are killed further down.
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)

    log.info('waiting for all to be unclean')
    # wait up to 15 minutes for every PG to be reported unclean
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
    )

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    # wait up to 15 minutes for every PG to be reported unclean and stale
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=num_pgs,
    )

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # Keep retrying the stats flush until it succeeds; a failure here is
    # taken to mean the revived OSDs are not up yet.
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )