1"""
2Resolve stuck peering
3"""
4import logging
5import time
6
7from teuthology import misc as teuthology
e306af50 8from tasks.util.rados import rados

log = logging.getLogger(__name__)

def task(ctx, config):
13 """
14 Test handling resolve stuck peering
15
16 requires 3 osds on a single test node
17 """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'Resolve stuck peering only accepts a dict for config'

    manager = ctx.managers['ceph']

    # wait until all three osds are up, then wait for the cluster to be clean
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)

    manager.wait_for_clean()

    dummyfile = '/etc/fstab'
    dummyfile1 = '/etc/resolv.conf'

    # create a pool with a single pg
    pool = 'foo'
    log.info('creating pool foo')
    manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
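    # with only one pg, every object in the pool maps to that pg, so its
    # peering state is what the rest of the test tracks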

    # set min_size of the pool to 1 so that we can continue with I/O
    # when 2 osds are down
    manager.set_pool_property(pool, "min_size", 1)

    osds = [0, 1, 2]

    primary = manager.get_pg_primary('foo', 0)
    log.info("primary osd is %d", primary)

    others = list(osds)
    others.remove(primary)

    log.info('writing initial objects')
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    # create a few objects
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])

    manager.wait_for_clean()

    # kill all osds except the primary and mark them down
    log.info('killing other osds except primary')
    for i in others:
        manager.kill_osd(i)
    for i in others:
        manager.mark_down_osd(i)

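    # with the other osds down and min_size at 1, these writes are applied
    # only on the primary, so the surviving osds fall behind its pg log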
    for i in range(100):
        rados(ctx, mon, ['-p', 'foo', 'put', 'new_%d' % i, dummyfile1])

    # kill the primary osd
    manager.kill_osd(primary)
    manager.mark_down_osd(primary)

    # revive the other 2 osds
    for i in others:
        manager.revive_osd(i)

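    # only the stale osds are up now; they cannot complete peering because
    # the osd that saw the newest writes is gone, so the pg should go down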
    # make sure that the pg is down
    # assuming the pg number for a single-pg pool starts at 0
    pgnum = 0
    pgstr = manager.get_pgid(pool, pgnum)
    stats = manager.get_single_pg_stats(pgstr)
    print(stats['state'])

    timeout = 60
    start = time.time()

    # poll the pg state until it is reported as down, or give up
    while 'down' not in stats['state']:
        assert time.time() - start < timeout, \
            'failed to reach down state before timeout expired'
        stats = manager.get_single_pg_stats(pgstr)

    # mark the lost primary as lost: this tells the cluster that its copy
    # of the data is gone, which lets the surviving osds finish peering
    # (the new_* objects written above go with it)
    manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,
                            '--yes-i-really-mean-it')

    # expect the pg status to be active+undersized+degraded;
    # the pg should recover and become active+clean within the timeout
    stats = manager.get_single_pg_stats(pgstr)
    print(stats['state'])

    timeout = 10
    start = time.time()

    # wait for the cluster to report no down pgs
    while manager.get_num_down():
        assert time.time() - start < timeout, \
            'failed to recover before timeout expired'

    # bring the lost primary back at the end of the test
    manager.revive_osd(primary)