Skip to content

Commit

Permalink
orchestra/daemon/cephadmunit: use journalctl to monitor stdout/err
Browse files Browse the repository at this point in the history
Using 'podman logs' is racy because the command may start before the
container starts, or even after it starts and exits.

Signed-off-by: Sage Weil <[email protected]>
  • Loading branch information
liewegas committed Jan 17, 2020
1 parent 72ef72f commit 4fa8304
Showing 1 changed file with 31 additions and 11 deletions.
42 changes: 31 additions & 11 deletions teuthology/orchestra/daemon/cephadmunit.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,18 +42,39 @@ def kill_cmd(self, sig):

def _start_logger(self):
name = '%s.%s' % (self.type_, self.id_)
#self.log.info('_start_logger %s' % name)
self.remote_logger = self.remote.run(
args=['sudo', self.use_cephadm, 'logs',
args=['sudo', 'journalctl',
'-f',
'--fsid', self.fsid,
'--name', name],
'-n', '0',
'-u',
'ceph-%s@%s.service' % (self.fsid, name)
],
logger=logging.getLogger(self.cluster + '.' + name),
label=name,
wait=False)
wait=False,
check_status=False,
)

def _join_logger(self):
def _stop_logger(self):
name = '%s.%s' % (self.type_, self.id_)
# this is a horrible kludge, since i don't know how else to kill
# the journalctl process at the other end :(
#self.log.info('_stop_logger %s running pkill' % name)
self.remote.run(
args=['sudo', 'pkill', '-f',
' '.join(['journalctl',
'-f',
'-n', '0',
'-u',
'ceph-%s@%s.service' % (self.fsid, name)]),
],
check_status=False,
)
#self.log.info('_stop_logger %s waiting')
self.remote_logger.wait()
self.remote_logger = None
#self.log.info('_stop_logger done')

def reset(self):
"""
Expand All @@ -70,14 +91,12 @@ def restart(self, *args, **kwargs):
"""
if not self.running():
self.log.info('Restarting %s (starting--it wasn\'t running)...' % self.name())
self.remote.sh(self.start_cmd)
self._start_logger()
self.remote.sh(self.start_cmd)
self.is_started = True
else:
self.log.info('Restarting %s...' % self.name())
self.remote.sh(self.restart_cmd)
self._join_logger()
self._start_logger()

def restart_with_args(self, extra_args):
"""
Expand Down Expand Up @@ -111,8 +130,8 @@ def start(self, timeout=300):
self.log.warn('Restarting a running daemon')
self.restart()
return
self.remote.run(self.start_cmd)
self._start_logger()
self.remote.run(self.start_cmd)

def stop(self, timeout=300):
"""
Expand All @@ -129,7 +148,7 @@ def stop(self, timeout=300):
self.log.info('Stopping %s...' % self.name())
self.remote.sh(self.stop_cmd)
self.is_started = False
self._join_logger()
self._stop_logger()
self.log.info('Stopped %s' % self.name())

# FIXME why are there two wait methods?
Expand All @@ -141,8 +160,9 @@ def wait(self, timeout=300):
any exception. Mark the daemon as not running.
"""
self.log.info('Waiting for %s to exit...' % self.name())
self._join_logger()
self.remote.sh(self.stop_cmd)
self.is_started = False
self._stop_logger()
self.log.info('Finished waiting for %s to stop' % self.name())

def wait_for_exit(self):
Expand Down

0 comments on commit 4fa8304

Please sign in to comment.