-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmon_recovery.py
80 lines (66 loc) · 2.22 KB
/
mon_recovery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Monitor recovery
"""
import logging
import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
def task(ctx, config):
"""
Test monitor recovery.
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'task only accepts a dict for configuration'
first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'),
)
mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
log.info("mon ids = %s" % mons)
manager.wait_for_mon_quorum_size(len(mons))
log.info('verifying all monitors are in the quorum')
for m in mons:
s = manager.get_mon_status(m)
assert s['state'] == 'leader' or s['state'] == 'peon'
assert len(s['quorum']) == len(mons)
log.info('restarting each monitor in turn')
for m in mons:
# stop a monitor
manager.kill_mon(m)
manager.wait_for_mon_quorum_size(len(mons) - 1)
# restart
manager.revive_mon(m)
manager.wait_for_mon_quorum_size(len(mons))
# in forward and reverse order,
rmons = mons
rmons.reverse()
for mons in mons, rmons:
log.info('stopping all monitors')
for m in mons:
manager.kill_mon(m)
log.info('forming a minimal quorum for %s, then adding monitors' % mons)
qnum = (len(mons) / 2) + 1
num = 0
for m in mons:
manager.revive_mon(m)
num += 1
if num >= qnum:
manager.wait_for_mon_quorum_size(num)
# on both leader and non-leader ranks...
for rank in [0, 1]:
# take one out
log.info('removing mon %s' % mons[rank])
manager.kill_mon(mons[rank])
manager.wait_for_mon_quorum_size(len(mons) - 1)
log.info('causing some monitor log activity')
m = 30
for n in range(1, m):
manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
log.info('adding mon %s back in' % mons[rank])
manager.revive_mon(mons[rank])
manager.wait_for_mon_quorum_size(len(mons))