-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdie_on_err.py
70 lines (56 loc) · 1.9 KB
/
die_on_err.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Raise an exception when an OSD dumps core or a test err path exists
"""
import contextlib
import logging
import time
from teuthology.orchestra import run
import ceph_manager
from teuthology import misc as teuthology
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def _exits_zero(remote, args):
    """
    Run ``args`` on ``remote`` and report whether it exited with status 0.

    The command is run with ``wait=True`` and ``check_status=False``
    (presumably so that a non-zero exit is reported through ``exitstatus``
    instead of raising -- confirm against teuthology's ``Remote.run``).
    """
    proc = remote.run(
        args=args,
        wait=True,
        check_status=False,
    )
    return proc.exitstatus == 0


@contextlib.contextmanager
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core

    Waits until the number of OSDs reported as up reaches the number of
    OSD instances in the cluster, then polls every OSD's remote in an
    endless loop:

    * ``test -e {testdir}/err`` -- a zero exit status is treated as an
      error reported by the test;
    * ``tail -1 /var/log/ceph/osd.<id>.log | grep -q 'end dump'`` -- a
      zero exit status is treated as a core dump.

    :param ctx: run context; ``ctx.cluster`` is used to find the first
                monitor's remote and each ``osd.<id>`` remote.
    :param config: task configuration, or None (replaced by an empty dict).
    :raises Exception: as soon as either check succeeds on any OSD.

    NOTE(review): this function is decorated with
    ``contextlib.contextmanager`` but contains no ``yield``; the polling
    loop below only ends by raising.  Confirm how callers rely on that
    before restructuring it.
    """
    if config is None:
        config = {}

    first_mon = teuthology.get_first_mon(ctx, config)
    # .keys() works on both Python 2 and 3 (.iterkeys() is Python-2-only);
    # the one-element unpacking still fails unless exactly one remote matches.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s', num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # Do not start watching until every OSD is reported up.
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    testdir = teuthology.get_testdir(ctx)
    # Loop-invariant: the marker path is the same for every OSD and pass.
    err_path = '{tdir}/err'.format(tdir=testdir)

    while True:
        for i in range(num_osds):
            (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.keys()

            if _exits_zero(osd_remote, ['test', '-e', err_path]):
                log.info("osd %d has an error", i)
                raise Exception("osd %d error" % i)

            log_path = '/var/log/ceph/osd.%d.log' % (i)

            # Only the last log line is inspected for the 'end dump' marker.
            dumped = _exits_zero(
                osd_remote,
                [
                    'tail', '-1', log_path,
                    run.Raw('|'),
                    'grep', '-q', 'end dump',
                ],
            )
            if dumped:
                log.info("osd %d dumped core", i)
                raise Exception("osd %d dumped core" % i)

        # Pause between polling passes over all OSDs.
        time.sleep(5)