Merge pull request ceph#27503 from dzafman/wip-39099
osd: Give recovery for inactive PGs a higher priority

Reviewed-by: Sage Weil <[email protected]>
Reviewed-by: Neha Ojha <[email protected]>
dzafman authored Apr 25, 2019
2 parents 5f34369 + 71d2546 commit 39cc14b
Showing 17 changed files with 330 additions and 66 deletions.
6 changes: 4 additions & 2 deletions doc/rados/operations/pools.rst
@@ -601,8 +601,10 @@ You may set values for the following keys:

``recovery_priority``

:Description: When a value is set it will boost the computed reservation priority
by this amount. This value should be less than 30.
:Description: When a value is set it will increase or decrease the computed
reservation priority. This value must be in the range -10 to
10. Use a negative priority for less important pools so they
have lower priority than any new pools.

:Type: Integer
:Default: ``0``
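
For illustration, a quick CLI sketch of the new range (the pool name "testpool" is a placeholder; the same commands are exercised in qa/workunits/cephtool/test.sh below):

    ceph osd pool set testpool recovery_priority 5     # favour this pool's recovery
    ceph osd pool set testpool recovery_priority -5    # deprioritize a less important pool
    ceph osd pool set testpool recovery_priority 0     # clears the option (back to default)
    ceph osd pool set testpool recovery_priority 11    # rejected: outside -10..10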
89 changes: 89 additions & 0 deletions qa/standalone/mon/osd-pool-create.sh
@@ -232,6 +232,95 @@ function TEST_pool_create_rep_expected_num_objects() {
fi
}

function check_pool_priority() {
local dir=$1
shift
local pools=$1
shift
local spread="$1"
shift
local results="$1"

setup $dir || return 1

EXTRA_OPTS="--debug_allow_any_pool_priority=true"
export EXTRA_OPTS
run_mon $dir a || return 1
run_mgr $dir x || return 1
run_osd $dir 0 || return 1
run_osd $dir 1 || return 1
run_osd $dir 2 || return 1

# Add pool 0 too
for i in $(seq 0 $pools)
do
num=$(expr $i + 1)
ceph osd pool create test${num} 1 1
done

wait_for_clean || return 1
for i in $(seq 0 $pools)
do
num=$(expr $i + 1)
ceph osd pool set test${num} recovery_priority $(expr $i \* $spread)
done

#grep "recovery_priority.*pool set" out/mon.a.log

bin/ceph osd dump

# Restart everything so mon converts the priorities
kill_daemons
run_mon $dir a || return 1
run_mgr $dir x || return 1
activate_osd $dir 0 || return 1
activate_osd $dir 1 || return 1
activate_osd $dir 2 || return 1
sleep 5

grep convert $dir/mon.a.log
ceph osd dump

pos=1
for i in $(ceph osd dump | grep ^pool | sed 's/.*recovery_priority //' | awk '{ print $1 }')
do
result=$(echo $results | awk "{ print \$${pos} }")
# A value of 0 is an unset value so sed/awk gets "pool"
if test $result = "0"
then
result="pool"
fi
test "$result" = "$i" || return 1
pos=$(expr $pos + 1)
done
}

function TEST_pool_pos_only_prio() {
local dir=$1
check_pool_priority $dir 20 5 "0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10" || return 1
}

function TEST_pool_neg_only_prio() {
local dir=$1
check_pool_priority $dir 20 -5 "0 0 -1 -1 -2 -2 -3 -3 -4 -4 -5 -5 -6 -6 -7 -7 -8 -8 -9 -9 -10" || return 1
}

function TEST_pool_both_prio() {
local dir=$1
check_pool_priority $dir 20 "5 - 50" "-10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10" || return 1
}

function TEST_pool_both_prio_no_neg() {
local dir=$1
check_pool_priority $dir 20 "2 - 4" "-4 -2 0 0 1 1 2 2 3 3 4 5 5 6 6 7 7 8 8 9 10" || return 1
}

function TEST_pool_both_prio_no_pos() {
local dir=$1
check_pool_priority $dir 20 "2 - 36" "-10 -9 -8 -8 -7 -7 -6 -6 -5 -5 -4 -3 -3 -2 -2 -1 -1 0 0 2 4" || return 1
}


main osd-pool-create "$@"

# Local Variables:
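
For reference, a sketch of how the expected strings in these tests can be derived. The monitor rescales out-of-range priorities into -10..10 (see OSDMonitor::convert_pool_priorities below), truncating toward zero; a result of 0 unsets the option, which the checker above then reports as "pool". For example, for TEST_pool_pos_only_prio (21 pools, spread 5, raw priorities 0, 5, ..., 100):

    # max raw priority is 100 > 10, so each value is scaled by 10/100 and truncated
    awk 'BEGIN { for (i = 0; i <= 20; i++) printf "%d ", int(i * 5 / 100 * 10); print "" }'
    # -> 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8 9 9 10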
4 changes: 2 additions & 2 deletions qa/standalone/osd/osd-backfill-prio.sh
@@ -30,8 +30,8 @@ function run() {
export objects=50
export poolprefix=test
export FORCE_PRIO="254" # See OSD_BACKFILL_PRIORITY_FORCED
export DEGRADED_PRIO="140" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE
export NORMAL_PRIO="100" # See OSD_BACKFILL_PRIORITY_BASE
export DEGRADED_PRIO="150" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE + 10
export NORMAL_PRIO="110" # See OSD_BACKFILL_PRIORITY_BASE + 10

local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
2 changes: 1 addition & 1 deletion qa/standalone/osd/osd-recovery-prio.sh
@@ -29,7 +29,7 @@ function run() {
export objects=200
export poolprefix=test
export FORCE_PRIO="255" # See OSD_RECOVERY_PRIORITY_FORCED
export NORMAL_PRIO="180" # See OSD_RECOVERY_PRIORITY_BASE
export NORMAL_PRIO="190" # See OSD_RECOVERY_PRIORITY_BASE + 10

local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
for func in $funcs ; do
3 changes: 2 additions & 1 deletion qa/standalone/scrub/osd-recovery-scrub.sh
@@ -83,7 +83,8 @@ function TEST_recovery_scrub() {
ERRORS=$(expr $ERRORS + 1)
fi

kill_daemons $dir || return 1
# Work around for http://tracker.ceph.com/issues/38195
kill_daemons $dir #|| return 1

declare -a err_strings
err_strings[0]="not scheduling scrubs due to active recovery"
6 changes: 4 additions & 2 deletions qa/workunits/cephtool/test.sh
@@ -2065,10 +2065,12 @@ function test_mon_osd_pool_set()
ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.'
ceph osd pool set $TEST_POOL_GETSET recovery_priority 5
ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: 5'
ceph osd pool set $TEST_POOL_GETSET recovery_priority -5
ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: -5'
ceph osd pool set $TEST_POOL_GETSET recovery_priority 0
ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.'
expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority -1
expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 30
expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority -11
expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 11

ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.'
ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5
1 change: 1 addition & 0 deletions src/common/legacy_config_opts.h
@@ -1536,3 +1536,4 @@ OPTION(fake_statfs_for_testing, OPT_INT) // Set a value for kb and compute kb_us
OPTION(rgw_sts_token_introspection_url, OPT_STR) // url for introspecting web tokens
OPTION(rgw_sts_client_id, OPT_STR) // Client Id
OPTION(rgw_sts_client_secret, OPT_STR) // Client Secret
OPTION(debug_allow_any_pool_priority, OPT_BOOL)
4 changes: 4 additions & 0 deletions src/common/options.cc
@@ -8170,6 +8170,10 @@ std::vector<Option> get_mds_client_options() {
Option("fake_statfs_for_testing", Option::TYPE_INT, Option::LEVEL_DEV)
.set_default(0)
.set_description("Set a value for kb and compute kb_used from total of num_bytes"),

Option("debug_allow_any_pool_priority", Option::TYPE_BOOL, Option::LEVEL_DEV)
.set_default(false)
.set_description("Allow any pool priority to be set to test conversion to new range"),
});
}
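
As a usage note, this is a dev-level flag intended only for tests; the standalone test above injects it when the daemons are started so that out-of-range values can be set and the start-up conversion exercised. A sketch:

    # as in qa/standalone/mon/osd-pool-create.sh
    EXTRA_OPTS="--debug_allow_any_pool_priority=true"
    export EXTRA_OPTS
    # with the flag active, values outside -10..10 (e.g. 100) are accepted
    ceph osd pool set test1 recovery_priority 100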

64 changes: 35 additions & 29 deletions src/mgr/DaemonServer.cc
@@ -1351,16 +1351,17 @@ bool DaemonServer::_handle_command(
}
auto q = pg_map.num_pg_by_osd.find(osd);
if (q != pg_map.num_pg_by_osd.end()) {
if (q->second.acting > 0 || q->second.up > 0) {
if (q->second.acting > 0 || q->second.up_not_acting > 0) {
active_osds.insert(osd);
affected_pgs += q->second.acting + q->second.up;
// XXX: For overlapping PGs, this counts them again
affected_pgs += q->second.acting + q->second.up_not_acting;
continue;
}
}
if (num_active_clean < pg_map.num_pg) {
// all pgs aren't active+clean; we need to be careful.
auto p = pg_map.osd_stat.find(osd);
if (p == pg_map.osd_stat.end()) {
if (p == pg_map.osd_stat.end() || !osdmap.is_up(osd)) {
missing_stats.insert(osd);
continue;
} else if (p->second.num_pgs > 0) {
@@ -1405,7 +1406,6 @@
if (!r) {
ss << "OSD(s) " << osds << " are safe to destroy without reducing data"
<< " durability.";
safe_to_destroy.swap(osds);
}
if (f) {
f->open_object_section("osd_status");
@@ -1477,7 +1477,7 @@
cmdctx->reply(r, ss);
return true;
}
map<pg_t,int> pg_delta; // pgid -> net acting set size change
int touched_pgs = 0;
int dangerous_pgs = 0;
cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
if (pg_map.num_pg_unknown > 0) {
@@ -1486,35 +1486,40 @@
r = -EAGAIN;
return;
}
for (auto osd : osds) {
auto p = pg_map.pg_by_osd.find(osd);
if (p != pg_map.pg_by_osd.end()) {
for (auto& pgid : p->second) {
--pg_delta[pgid];
for (const auto& q : pg_map.pg_stat) {
set<int32_t> pg_acting; // net acting sets (with no missing if degraded)
bool found = false;
if (q.second.state & PG_STATE_DEGRADED) {
for (auto& anm : q.second.avail_no_missing) {
if (osds.count(anm.osd)) {
found = true;
continue;
}
pg_acting.insert(anm.osd);
}
} else {
for (auto& a : q.second.acting) {
if (osds.count(a)) {
found = true;
continue;
}
pg_acting.insert(a);
}
}
}
for (auto& p : pg_delta) {
auto q = pg_map.pg_stat.find(p.first);
if (q == pg_map.pg_stat.end()) {
ss << "missing information about " << p.first << "; cannot draw"
<< " any conclusions";
r = -EAGAIN;
return;
if (!found) {
continue;
}
if (!(q->second.state & PG_STATE_ACTIVE) ||
(q->second.state & PG_STATE_DEGRADED)) {
// we don't currently have a good way to tell *how* degraded
// a degraded PG is, so we have to assume we cannot remove
// any more replicas/shards.
touched_pgs++;
if (!(q.second.state & PG_STATE_ACTIVE) ||
(q.second.state & PG_STATE_DEGRADED)) {
++dangerous_pgs;
continue;
}
const pg_pool_t *pi = osdmap.get_pg_pool(p.first.pool());
const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
if (!pi) {
++dangerous_pgs; // pool is creating or deleting
} else {
if (q->second.acting.size() + p.second < pi->min_size) {
if (pg_acting.size() < pi->min_size) {
++dangerous_pgs;
}
}
Expand All @@ -1525,14 +1530,15 @@ bool DaemonServer::_handle_command(
return true;
}
if (dangerous_pgs) {
ss << dangerous_pgs << " PGs are already degraded or might become "
<< "unavailable";
ss << dangerous_pgs << " PGs are already too degraded, would become"
<< " too degraded or might become unavailable";
cmdctx->reply(-EBUSY, ss);
return true;
}
ss << "OSD(s) " << osds << " are ok to stop without reducing"
<< " availability, provided there are no other concurrent failures"
<< " or interventions. " << pg_delta.size() << " PGs are likely to be"
<< " availability or risking data, provided there are no other concurrent failures"
<< " or interventions." << std::endl;
ss << touched_pgs << " PGs are likely to be"
<< " degraded (but remain available) as a result.";
cmdctx->reply(0, ss);
return true;
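
For context, this handler backs the ceph osd ok-to-stop command; a hedged usage sketch, with the reply text taken from the strings built above (OSD ids, PG counts and exact formatting are illustrative):

    ceph osd ok-to-stop 1 2
    # success:
    #   OSD(s) 1,2 are ok to stop without reducing availability or risking data,
    #   provided there are no other concurrent failures or interventions.
    #   4 PGs are likely to be degraded (but remain available) as a result.
    # failure (reply is -EBUSY):
    #   7 PGs are already too degraded, would become too degraded or might become unavailable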
70 changes: 63 additions & 7 deletions src/mon/OSDMonitor.cc
@@ -652,6 +652,11 @@ void OSDMonitor::on_active()

if (mon->is_leader()) {
mon->clog->debug() << "osdmap " << osdmap;
if (!priority_convert) {
// Only do this once at start-up
convert_pool_priorities();
priority_convert = true;
}
} else {
list<MonOpRequestRef> ls;
take_all_failures(ls);
@@ -7749,13 +7754,12 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
ss << "error parsing int value '" << val << "': " << interr;
return -EINVAL;
}
if (n < 0) {
ss << "pool recovery_priority can not be negative";
return -EINVAL;
} else if (n >= 30) {
ss << "pool recovery_priority should be less than 30 due to "
<< "Ceph internal implementation restrictions";
return -EINVAL;
if (!g_conf()->debug_allow_any_pool_priority) {
if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
<< " and " << OSD_POOL_PRIORITY_MAX;
return -EINVAL;
}
}
} else if (var == "pg_autoscale_bias") {
if (f < 0.0 || f > 1000.0) {
@@ -13348,3 +13352,55 @@ void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
ret, epoch, get_last_committed(), blp);
mon->send_reply(op, reply);
}

void OSDMonitor::convert_pool_priorities(void)
{
pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
int64_t max_prio = 0;
int64_t min_prio = 0;
for (const auto &i : osdmap.get_pools()) {
const auto &pool = i.second;

if (pool.opts.is_set(key)) {
int64_t prio;
pool.opts.get(key, &prio);
if (prio > max_prio)
max_prio = prio;
if (prio < min_prio)
min_prio = prio;
}
}
if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
dout(20) << __func__ << " nothing to fix" << dendl;
return;
}
// Current pool priorities exceeds new maximum
for (const auto &i : osdmap.get_pools()) {
const auto pool_id = i.first;
pg_pool_t pool = i.second;

int64_t prio = 0;
pool.opts.get(key, &prio);
int64_t n;

if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
// Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
} else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
// Scaled priority range OSD_POOL_PRIORITY_MIN to 0
n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
} else {
continue;
}
if (n == 0) {
pool.opts.unset(key);
} else {
pool.opts.set(key, static_cast<int64_t>(n));
}
dout(10) << __func__ << " pool " << pool_id
<< " recovery_priority adjusted "
<< prio << " to " << n << dendl;
pool.last_change = pending_inc.epoch;
pending_inc.new_pools[pool_id] = pool;
}
}
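
A worked sketch of the rescaling above, matching TEST_pool_both_prio_no_pos earlier (spread "2 - 36", raw priorities -36..4): only the negative side exceeds the new range, so negatives are scaled by -10/-36 while 2 and 4 are left alone, and anything that truncates to 0 has the option unset. awk's int() truncates toward zero, like the float-to-int64_t conversion in the C++ above:

    awk 'BEGIN {
        min = -36; lo = -10
        for (i = 0; i <= 20; i++) {
            p = i * 2 - 36
            if (p < 0 && min < lo)
                n = int(p / min * lo)    # scale negatives into lo..0
            else
                n = p                    # 0, 2 and 4 are already within range
            printf "%d ", n
        }
        print ""
    }'
    # -> -10 -9 -8 -8 -7 -7 -6 -6 -5 -5 -4 -3 -3 -2 -2 -1 -1 0 0 2 4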
2 changes: 2 additions & 0 deletions src/mon/OSDMonitor.h
@@ -220,6 +220,7 @@ class OSDMonitor : public PaxosService {
set<int> pending_metadata_rm;
map<int, failure_info_t> failure_info;
map<int,utime_t> down_pending_out; // osd down -> out
bool priority_convert = false;

map<int,double> osd_weight;

@@ -701,6 +702,7 @@
pending_inc.new_flags &= ~flag;
}
}
void convert_pool_priorities(void);
};

#endif