
qa/tasks/rebuild_mondb: update to address ceph-mgr changes
- revive ceph-mgr after updating the keyring cap
- grant "mgr:allow *" to client.admin
- minor refactors

Signed-off-by: Kefu Chai <[email protected]>
tchaikov committed May 28, 2017
1 parent 5f003d6 commit 8abc6e1
Showing 2 changed files with 80 additions and 43 deletions.
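For context, a minimal sketch of how the reworked entry point is driven. The ctx run context and the import path are assumptions here; in a real run teuthology schedules this from a job yaml, and 'keyring_path' is the only option task() reads from its config:

    # Sketch only: calling the task directly rather than from a job yaml.
    # `ctx` (teuthology run context) is assumed to be in scope, and the
    # import path assumes qa/ is on sys.path as in a teuthology run.
    from tasks import rebuild_mondb

    config = {
        # Optional; task() falls back to '/etc/ceph/{cluster}.keyring'.
        'keyring_path': '/etc/ceph/ceph.keyring',
    }
    rebuild_mondb.task(ctx, config)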
14 changes: 14 additions & 0 deletions qa/tasks/ceph_manager.py
@@ -2268,6 +2268,20 @@ def revive_mon(self, mon):
self.make_admin_daemon_dir(remote)
self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()

def revive_mgr(self, mgr):
"""
Restart by either power cycling (if the config says so),
or by doing a normal restart.
"""
if self.config.get('powercycle'):
remote = self.find_remote('mgr', mgr)
self.log('revive_mgr on mgr.{m} doing powercycle of {s}'.
format(m=mgr, s=remote.name))
self._assert_ipmi(remote)
remote.console.power_on()
self.make_admin_daemon_dir(remote)
self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()

def get_mon_status(self, mon):
"""
Extract all the monitor status information from the cluster
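The new revive_mgr() mirrors the existing revive_mon() above it. A minimal usage sketch, assuming a CephManager built the same way rebuild_mondb.py builds one below, and a hypothetical mgr id 'x':

    # Sketch: reviving one mgr daemon; `ctx`, `mon`, and the id 'x' are
    # assumed/hypothetical here.
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))
    # Power-cycles the node when the 'powercycle' config is set,
    # otherwise does a normal daemon restart.
    manager.revive_mgr('x')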
109 changes: 66 additions & 43 deletions qa/tasks/rebuild_mondb.py
@@ -14,7 +14,7 @@
log = logging.getLogger(__name__)


def push_directory(path, remote, remote_dir):
def _push_directory(path, remote, remote_dir):
"""
local_temp_path=`mktemp`
tar czf $local_temp_path $path
@@ -46,45 +46,20 @@ def push_directory(path, remote, remote_dir):
remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def task(ctx, config):
"""
Test monitor recovery from OSD
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'task only accepts a dict for configuration'

first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'))

mons = ctx.cluster.only(teuthology.is_type('mon'))
def _nuke_mons(manager, mons, mon_id):
assert mons
# note down the first cluster_name and mon_id
# we will recover it later on
cluster_name = None
mon_id = None
is_mon = teuthology.is_type('mon')
for remote, roles in mons.remotes.iteritems():
is_mon = teuthology.is_type('mon')
for role in roles:
if not is_mon(role):
continue
cluster, _, m = teuthology.split_role(role)
if cluster_name is None:
cluster_name = cluster
mon_id = m
assert cluster_name == cluster
log.info('killing {cluster}:mon.{mon}'.format(
cluster=cluster,
mon=m))
manager.kill_mon(m)
mon_data = os.path.join('/var/lib/ceph/mon/',
'{0}-{1}'.format(cluster_name, m))
'{0}-{1}'.format(cluster, m))
if m == mon_id:
# so we will only need to recreate the store.db for the
# first mon, would be easier than mkfs on it then replace
@@ -94,13 +94,15 @@ def task(ctx, config):
else:
remote.run(args=['sudo', 'rm', '-r', mon_data])


def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
local_mstore = tempfile.mkdtemp()

# collect the maps from all OSDs
osds = ctx.cluster.only(teuthology.is_type('osd'))
is_osd = teuthology.is_type('osd')
osds = ctx.cluster.only(is_osd)
assert osds
for osd, roles in osds.remotes.iteritems():
is_osd = teuthology.is_type('osd')
for role in roles:
if not is_osd(role):
continue
@@ -113,7 +113,7 @@ def task(ctx, config):
osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

push_directory(local_mstore, osd, osd_mstore)
_push_directory(local_mstore, osd, osd_mstore)
log.info('rm -rf {0}'.format(local_mstore))
shutil.rmtree(local_mstore)
# update leveldb with OSD data
@@ -135,12 +135,10 @@ def task(ctx, config):
# pull from recovered leveldb from client
mon_store_dir = os.path.join('/var/lib/ceph/mon',
'{0}-{1}'.format(cluster_name, mon_id))
push_directory(local_mstore, mon, mon_store_dir)
_push_directory(local_mstore, mon, mon_store_dir)
mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
shutil.rmtree(local_mstore)
default_keyring = '/etc/ceph/{cluster}.keyring'.format(
cluster=cluster_name)
keyring_path = config.get('keyring_path', default_keyring)

# fill up the caps in the keyring file
mon.run(args=['sudo',
'ceph-authtool', keyring_path,
@@ -151,23 +151,25 @@ def task(ctx, config):
'-n', 'client.admin',
'--cap', 'mon', 'allow *',
'--cap', 'osd', 'allow *',
'--cap', 'mds', 'allow *'])
'--cap', 'mds', 'allow *',
'--cap', 'mgr', 'allow *'])
mon.run(args=['sudo', '-u', 'ceph',
'ceph-monstore-tool', mon_store_dir,
'rebuild', '--', '--keyring',
keyring_path])


def _revive_mons(manager, mons, recovered, keyring_path):
# revive monitors
# the initial monmap is in the ceph.conf, so we are good.
n_mons = 0
is_mon = teuthology.is_type('mon')
for remote, roles in mons.remotes.iteritems():
is_mon = teuthology.is_type('mon')
for role in roles:
if not is_mon(role):
continue
cluster, _, m = teuthology.split_role(role)
assert cluster_name == cluster
if mon_id != m:
if recovered != m:
log.info('running mkfs on {cluster}:mon.{mon}'.format(
cluster=cluster,
mon=m))
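The added mgr cap matters because ceph-monstore-tool uses this keyring to seed the auth database of the rebuilt monitor: without "mgr: allow *", client.admin would be rejected by the mgr once the cluster is back. A quick post-recovery check one might run (a sketch, following the mon.run() convention used above):

    # Sketch: confirm the recovered client.admin entry carries the mgr cap.
    mon.run(args=['sudo', 'ceph', 'auth', 'get', 'client.admin'])
    # expected output includes: caps mgr = "allow *"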
@@ -179,15 +179,61 @@ def task(ctx, config):
'--mkfs',
'-i', m,
'--keyring', keyring_path])
log.info('reviving mon.{0}'.format(m))
manager.revive_mon(m)
n_mons += 1

manager.wait_for_mon_quorum_size(n_mons, timeout=30)
for osd, roles in osds.remotes.iteritems():
is_osd = teuthology.is_type('osd')


def _revive_mgrs(ctx, manager):
is_mgr = teuthology.is_type('mgr')
mgrs = ctx.cluster.only(is_mgr)
for _, roles in mgrs.remotes.iteritems():
for role in roles:
if not is_mgr(role):
continue
_, _, mgr_id = teuthology.split_role(role)
log.info('reviving mgr.{0}'.format(mgr_id))
manager.revive_mgr(mgr_id)


def _revive_osds(ctx, manager):
is_osd = teuthology.is_type('osd')
osds = ctx.cluster.only(is_osd)
for _, roles in osds.remotes.iteritems():
for role in roles:
if not is_osd(role):
continue
_, _, osd_id = teuthology.split_role(role)
log.info('reviving osd.{0}'.format(osd_id))
manager.revive_osd(osd_id)


def task(ctx, config):
"""
Test monitor recovery from OSD
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'task only accepts a dict for configuration'

first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'))

mons = ctx.cluster.only(teuthology.is_type('mon'))
# note down the first cluster_name and mon_id
# we will recover it later on
cluster_name, _, mon_id = teuthology.split_role(first_mon)
_nuke_mons(manager, mons, mon_id)
default_keyring = '/etc/ceph/{cluster}.keyring'.format(
cluster=cluster_name)
keyring_path = config.get('keyring_path', default_keyring)
_rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
_revive_mons(manager, mons, mon_id, keyring_path)
_revive_mgrs(ctx, manager)
_revive_osds(ctx, manager)
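Once task() returns, the mon store has been rebuilt from the OSDs' copies of the maps and the mgrs and OSDs are back. A follow-up sanity check one might append (a sketch: the quorum size of 3 is an assumption, and wait_for_clean() is the usual CephManager health helper):

    # Sketch: post-recovery checks, assuming a 3-mon cluster.
    manager.wait_for_mon_quorum_size(3, timeout=30)  # same helper _revive_mons uses
    manager.wait_for_clean()  # block until all PGs are active+clean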
