Commit 28e38e3

Merge pull request ceph#59483 from kamoltat/wip-ksirivad-exit-stretch-mode

mon [stretch mode]: support disable_stretch_mode
Reviewed-by: Nitzan Mordechai <[email protected]>
kamoltat authored Nov 5, 2024
2 parents a91bcae + a7f3b7b commit 28e38e3
Showing 12 changed files with 945 additions and 2 deletions.
28 changes: 28 additions & 0 deletions doc/rados/operations/stretch-mode.rst
@@ -247,6 +247,34 @@ possible, if needed).

.. _Changing Monitor elections: ../change-mon-elections

Exiting Stretch Mode
=====================
To exit stretch mode, run the following command:

.. prompt:: bash $

   ceph mon disable_stretch_mode [{crush_rule}] --yes-i-really-mean-it


.. describe:: {crush_rule}

   The CRUSH rule that the user wants all pools to move back to. If this
   is not specified, the pools will move back to the default CRUSH rule.

   :Type: String
   :Required: No.
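
For example, assuming the default CRUSH rule in the cluster is named
``replicated_rule`` (substitute the name of an existing rule if yours
differs), the following exits stretch mode and moves all pools back to
that rule:

.. prompt:: bash $

   ceph mon disable_stretch_mode replicated_rule --yes-i-really-mean-it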

This command moves the cluster back to normal mode and takes it out of
stretch mode. All pools revert their ``size`` and ``min_size`` to the
default values they started with. At this point the user is responsible for
scaling the cluster down to the desired number of OSDs if they choose to
operate with fewer OSDs.

Note that this command will not execute while the cluster is in
``recovery stretch mode``. It executes only when the cluster is in
``degraded stretch mode`` or ``healthy stretch mode``.

Limitations of Stretch Mode
===========================
When using stretch mode, OSDs must be located at exactly two sites.
58 changes: 58 additions & 0 deletions qa/suites/rados/singleton/all/stretch-mode-5-mons-8-osds.yaml
@@ -0,0 +1,58 @@
roles:
- - mon.a
  - mon.b
  - mgr.a
  - mgr.b
  - osd.0
  - osd.1
  - osd.2
  - osd.3
- - mon.c
  - mon.d
  - mgr.c
  - mgr.d
  - osd.4
  - osd.5
  - osd.6
  - osd.7
- - mon.e
- - client.0

openstack:
  - volumes: # attached to each instance
      count: 3
      size: 10 # GB
overrides:
  ceph:
    conf:
      global:
        mon election default strategy: 3
        osd pool default size: 3
        osd pool default min size: 2
      mon:
        debug mon: 30
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - \(POOL_
      - \(CACHE_POOL_
      - overall HEALTH_
      - \(PG_AVAILABILITY\)
      - Reduced data availability
      - \(PG_DEGRADED\)
      - \(MON_DOWN\)
      - \(OSD_DATACENTER_DOWN\)
      - \(OSD_DOWN\)
      - \(OSD_HOST_DOWN\)


- workunit:
    clients:
      client.0:
        - mon/mon-stretch-mode-5-mons-8-osds.sh
- cephfs_test_runner:
    modules:
      - tasks.stretch_mode_disable_enable
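
The ``tasks.stretch_mode_disable_enable`` module referenced above is not
included in this excerpt. As a rough sketch of how such a test could drive
the new command through the ``CephManager`` helpers added below, assuming a
``manager`` object is available (the function name and timeout values here
are illustrative, not the actual test code):

import time

def disable_stretch_mode_and_wait(manager, crush_rule=None, timeout=600):
    """Illustrative sketch: exit stretch mode, then wait for PGs to settle."""
    # raw_cluster_cmd is an existing CephManager method; pg_all_active_clean
    # is one of the helpers this commit adds to qa/tasks/ceph_manager.py.
    cmd = ['mon', 'disable_stretch_mode']
    if crush_rule:
        cmd.append(crush_rule)
    cmd.append('--yes-i-really-mean-it')
    manager.raw_cluster_cmd(*cmd)
    deadline = time.time() + timeout
    while not manager.pg_all_active_clean():
        assert time.time() < deadline, 'PGs did not become active+clean in time'
        time.sleep(10)
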
90 changes: 90 additions & 0 deletions qa/tasks/ceph_manager.py
@@ -2796,6 +2796,59 @@ def _get_num_peered(self, pgs):
                num += 1
        return num

    def _print_not_active_clean_pg(self, pgs):
        """
        Print the PGs that are not active+clean.
        """
        for pg in pgs:
            if not (pg['state'].count('active') and
                    pg['state'].count('clean') and
                    not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active+clean, but %s",
                    pg['pgid'], pg['state']
                )

    def pg_all_active_clean(self):
        """
        Check if all pgs are active+clean
        return: True if all pgs are active+clean else False
        """
        pgs = self.get_pg_stats()
        result = self._get_num_active_clean(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active+clean")
        else:
            log.debug("Not all PGs are active+clean")
            self._print_not_active_clean_pg(pgs)
        return result

    def _print_not_active_pg(self, pgs):
        """
        Print the PGs that are not active.
        """
        for pg in pgs:
            if not (pg['state'].count('active')
                    and not pg['state'].count('stale')):
                log.debug(
                    "PG %s is not active, but %s",
                    pg['pgid'], pg['state']
                )

    def pg_all_active(self):
        """
        Check if all pgs are active
        return: True if all pgs are active else False
        """
        pgs = self.get_pg_stats()
        result = self._get_num_active(pgs) == len(pgs)
        if result:
            log.debug("All PGs are active")
        else:
            log.debug("Not all PGs are active")
            self._print_not_active_pg(pgs)
        return result

    def is_clean(self):
        """
        True if all pgs are clean
@@ -3237,6 +3290,26 @@ def revive_mgr(self, mgr):
        self.make_admin_daemon_dir(remote)
        self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()

    def get_crush_rule_id(self, crush_rule_name):
        """
        Get crush rule id by name
        :returns: int -- crush rule id
        """
        out = self.raw_cluster_cmd('osd', 'crush', 'rule', 'dump', '--format=json')
        j = json.loads('\n'.join(out.split('\n')[1:]))
        for rule in j:
            if rule['rule_name'] == crush_rule_name:
                return rule['rule_id']
        assert False, 'rule %s not found' % crush_rule_name

    def get_mon_dump_json(self):
        """
        mon dump --format=json converted to a python object
        :returns: the python object
        """
        out = self.raw_cluster_cmd('mon', 'dump', '--format=json')
        return json.loads('\n'.join(out.split('\n')[1:]))

    def get_mon_status(self, mon):
        """
        Extract all the monitor status information from the cluster
@@ -3340,6 +3413,23 @@ def get_service_task_status(self, service, status_key):
        self.log(task_status)
        return task_status

    # Stretch mode related functions
    def is_degraded_stretch_mode(self):
        """
        Return whether the cluster is in degraded stretch mode
        """
        try:
            osdmap = self.get_osd_dump_json()
            stretch_mode = osdmap.get('stretch_mode', {})
            degraded_stretch_mode = stretch_mode.get('degraded_stretch_mode', 0)
            self.log("is_degraded_stretch_mode: {0}".format(degraded_stretch_mode))
            return degraded_stretch_mode == 1
        except (TypeError, AttributeError) as e:
            # Log the error or handle it as needed
            self.log("Error accessing degraded_stretch_mode: {0}".format(e))
            return False


def utility_task(name):
    """
    Generate ceph_manager subtask corresponding to ceph_manager
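
As a usage note for the stretch-mode helper above: a test that simulates a
datacenter outage might poll ``is_degraded_stretch_mode`` before exercising
recovery. A minimal sketch, assuming a ``manager`` object and illustrative
timing values (none of this is code from the commit):

import time

def wait_for_degraded_stretch_mode(manager, timeout=300, interval=10):
    """Illustrative sketch: block until the osdmap reports degraded stretch mode."""
    deadline = time.time() + timeout
    while not manager.is_degraded_stretch_mode():
        assert time.time() < deadline, 'cluster never entered degraded stretch mode'
        time.sleep(interval)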
