1 """
2 Dump_stuck command
3 """
4 import logging
5 import re
6 import time
7
8 import ceph_manager
9 from teuthology import misc as teuthology
10
11
12 log = logging.getLogger(__name__)
13
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks.  Make sure get_stuck_pgs returns the right amount of information, then
    extract health information from the raw_cluster_cmd and compare the results with
    the values passed in.  This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: number of inactive PGs that are stuck
    :param num_unclean: number of unclean PGs that are stuck
    :param num_stale: number of stale PGs that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale

def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

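    # Flush PG stats from both OSDs so the monitor has current data
    # before we wait for the cluster to report clean.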
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

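    # Shorten mon_pg_stuck_threshold so PGs are flagged as stuck after only
    # 10 seconds instead of the default, keeping this test reasonably fast.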
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

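    # Baseline: a healthy cluster should report no stuck PGs in any category.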
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

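    # Mark osd.0 out.  With only one OSD left in, the PGs can recover onto
    # osd.1 but cannot become clean, so they should end up stuck unclean.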
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery(timeout)

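    # Every PG is expected to be stuck unclean now, but still active
    # (served by osd.1) and not stale.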
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

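    # Bring osd.0 back in; once the cluster is clean again nothing
    # should be reported as stuck.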
    manager.mark_in_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

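    # Now kill osd.0 outright instead of merely marking it out.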
    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

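    # osd.1 is still up and reporting, so the PGs should become stuck
    # unclean but not stale.  Poll until the monitor flags all of them.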
    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become unclean
            if time.time() - starttime > 900:
                raise

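    # Kill the remaining OSD as well.  With no OSD left to report PG
    # stats, every PG should eventually be reported as stale too.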
    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

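    # Restart both OSDs and mark them back in.  The flush_pg_stats tells
    # may fail until the daemons are actually up, hence the retry loop.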
    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
            manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

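    # After full recovery nothing should be stuck in any category.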
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )