-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdump_stuck.py
161 lines (138 loc) · 4.31 KB
/
dump_stuck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Dump_stuck command
"""
import logging
import time
from tasks import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Check that the number of stuck PGs in each state matches expectations.

    Calls ``manager.get_stuck_pgs`` once each for the 'inactive', 'unclean'
    and 'stale' states, logs the observed / expected counts, and fails on
    the first state (checked in that order) whose observed count differs
    from the expected one.

    :param manager: Ceph manager; must provide get_stuck_pgs(state, timeout)
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param timeout: value passed as the second argument of every
                    get_stuck_pgs call
    :raises AssertionError: if an observed count does not match
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    # Raise explicitly instead of using bare ``assert`` statements: the
    # checks then survive ``python -O`` and the failure names the state that
    # mismatched. AssertionError is kept because callers catch it to retry.
    comparisons = (
        ('inactive', len(inactive), num_inactive),
        ('unclean', len(unclean), num_unclean),
        ('stale', len(stale), num_stale),
    )
    for state, found, expected in comparisons:
        if found != expected:
            raise AssertionError(
                'expected %s stuck %s pgs, found %s' % (expected, state, found))
def _wait_for_stuck(manager, num_inactive, num_unclean, num_stale,
                    max_wait=900, poll_interval=1):
    """
    Poll check_stuck until the expected stuck-PG counts are observed.

    :param manager: Ceph manager handed through to check_stuck
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param max_wait: seconds to keep retrying before giving up
    :param poll_interval: seconds to sleep between failed attempts
    :raises AssertionError: the last check_stuck failure, once max_wait
                            seconds have elapsed
    """
    starttime = time.time()
    while True:
        try:
            check_stuck(
                manager,
                num_inactive=num_inactive,
                num_unclean=num_unclean,
                num_stale=num_stale,
            )
            return
        except AssertionError:
            if time.time() - starttime > max_wait:
                raise
            # Pause before polling again rather than retrying back-to-back.
            time.sleep(poll_interval)


def task(ctx, config):
    """
    Test the dump_stuck command.

    Requires a cluster with exactly two OSDs and no task configuration.
    The sequence is: wait for clean, mark osd 0 out and back in (expecting
    no stuck PGs at each step), kill osd 0 and wait for every PG to be
    reported stuck unclean, kill osd 1 and wait for every PG to also be
    reported stuck stale, then revive all OSDs and expect no stuck PGs.

    :param ctx: Context
    :param config: Configuration (must be None)
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # NOTE(review): presumably lowers the stuck threshold to 10 seconds so
    # that it lines up with check_stuck's default timeout of 10 -- confirm.
    manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
                            # '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)

    log.info('waiting for all to be unclean')
    # wait up to 15 minutes for every PG to be reported unclean
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
    )

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    # wait up to 15 minutes for every PG to be reported stale as well
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=num_pgs,
    )

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)

    # Keep retrying once per second until flush_pg_stats stops raising;
    # a failure here is taken to mean the OSDs have not started yet.
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )