Skip to content

Commit

Permalink
mon: subtree-based crush type down health warnings
Browse files Browse the repository at this point in the history
Signed-off-by: Neha Ojha <[email protected]>
  • Loading branch information
neha-ojha committed May 12, 2017
1 parent d69aacc commit 72682e5
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 71 deletions.
14 changes: 14 additions & 0 deletions src/crush/CrushWrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,20 @@ int CrushWrapper::get_full_location_ordered(int id, vector<pair<string, string>
return 0;
}

/*
 * Render the location of item `id` as one comma-separated string.
 *
 * The (first, second) pairs produced by get_full_location_ordered() are
 * emitted in the opposite order to the one that call returns them in, each
 * formatted as "first=second" and joined with "," (no trailing comma).
 * The status returned by get_full_location_ordered() is not inspected; when
 * it yields no pairs the result is an empty string.
 */
string CrushWrapper::get_full_location_ordered_string(int id)
{
  vector<pair<string, string> > ordered_pairs;
  get_full_location_ordered(id, ordered_pairs);

  string joined;
  // Iterate back-to-front rather than reversing the vector in place.
  for (auto entry = ordered_pairs.rbegin(); entry != ordered_pairs.rend(); ++entry) {
    // Every element except the first one written is preceded by a comma.
    if (entry != ordered_pairs.rbegin()) {
      joined += ",";
    }
    joined += entry->first;
    joined += "=";
    joined += entry->second;
  }
  return joined;
}

map<int, string> CrushWrapper::get_parent_hierarchy(int id)
{
Expand Down
9 changes: 9 additions & 0 deletions src/crush/CrushWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,15 @@ class CrushWrapper {
*/
int get_full_location_ordered(int id, vector<pair<string, string> >& path);

/**
* string form of get_full_location_ordered(int id, vector<pair<string, string> >& path).
*
* Collects the type/name pairs for the given item via
* get_full_location_ordered(), reverses their order so that they read in
* descending hierarchical order, and joins them as key1=val1,key2=val2
* (comma separated, no trailing comma).
*
* The status code of get_full_location_ordered() is not propagated; if no
* pairs are produced the returned string is empty.
*
* @param id item to describe
* @return the location in descending hierarchy as a single string
*/
string get_full_location_ordered_string(int id);

/**
* returns (type_id, type) of all parent buckets between id and
* default, can be used to check for anomalous CRUSH maps
Expand Down
135 changes: 87 additions & 48 deletions src/mon/OSDMonitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3460,15 +3460,14 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
} else {
int num_in_osds = 0;
int num_down_in_osds = 0;
int num_in_subtrees = 0;
int num_down_in_subtrees = 0;
set<int> osds;
set<int> down_cache; // quick cache of down subtrees
set<int> in_subtrees;
set<int> up_in_subtrees;
set<int> down_in_subtrees;
set<string> down_in_subtree_names;
int type = osdmap.crush->get_type_id(g_conf->mon_osd_down_out_subtree_limit);
set<int> down_in_osds;
set<int> up_in_osds;
set<int> subtree_up;
unordered_map<int, set<int> > subtree_type_down;
unordered_map<int, int> num_osds_subtree;
int max_type = osdmap.crush->get_num_type_names() - 1;

for (int i = 0; i < osdmap.get_max_osd(); i++) {
if (!osdmap.exists(i)) {
if (osdmap.crush->item_exists(i)) {
Expand All @@ -3479,57 +3478,97 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
if (osdmap.is_out(i))
continue;
++num_in_osds;
// get the id of the parent subtree
int subtree_id = osdmap.get_parent_subtree_id(g_ceph_context, i, type, &down_cache);
if (subtree_id != -ENOENT) {
in_subtrees.insert(subtree_id);
}

if (down_in_osds.count(i) || up_in_osds.count(i))
continue;
if (!osdmap.is_up(i)) {
++num_down_in_osds;
if (detail) {
const osd_info_t& info = osdmap.get_info(i);
ostringstream ss;
map<string, string> loc;
loc = osdmap.crush->get_full_location(i);
ss << "osd." << i << loc << " is down since epoch " << info.down_at << ", last address "
<< osdmap.get_addr(i);
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
down_in_osds.insert(i);
int parent_id = 0;
int current = i;
for (int type = 0; type <= max_type; type++) {
int r = osdmap.crush->get_immediate_parent_id(current, &parent_id);
if (r == -ENOENT)
break;
// break early if this parent is already marked as up
if (subtree_up.count(parent_id))
break;
type = osdmap.crush->get_bucket_type(parent_id);
if (!osdmap.subtree_type_is_down(g_ceph_context, parent_id, type, &down_in_osds, &up_in_osds, &subtree_up, &subtree_type_down))
break;
current = parent_id;
}
} else {
// if an osd in a subtree is up, implies subtree is not down
up_in_subtrees.insert(subtree_id);
}
}

set_difference(in_subtrees.begin(), in_subtrees.end(),
up_in_subtrees.begin(), up_in_subtrees.end(),
inserter(down_in_subtrees, down_in_subtrees.end()));
num_in_subtrees = in_subtrees.size();
num_down_in_subtrees = down_in_subtrees.size();
for (set<int>::iterator it = down_in_subtrees.begin();
it != down_in_subtrees.end(); ++it) {
down_in_subtree_names.insert(osdmap.crush->get_item_name(*it));
// calculate the number of down osds in each down subtree and store it in num_osds_subtree
for (int type = 1; type <= max_type; type++) {
for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
if (type == 1) {
list<int> children;
int num = osdmap.crush->get_children(*j, &children);
num_osds_subtree[*j] = num;
} else {
list<int> children;
int num = 0;
int num_children = osdmap.crush->get_children(*j, &children);
if (num_children == 0)
continue;
for (auto l = children.begin(); l != children.end(); ++l) {
if (num_osds_subtree[*l] > 0) {
num = num + num_osds_subtree[*l];
}
}
num_osds_subtree[*j] = num;
}
}
}
num_down_in_osds = down_in_osds.size();
assert(num_down_in_osds <= num_in_osds);
assert(num_down_in_subtrees <= num_in_subtrees);
if (num_down_in_osds > 0) {
ostringstream ss;
ss << num_down_in_osds << "/" << num_in_osds << " in osds are down";
ss << "\n";
// summary of down subtree types and osds
for (int type = max_type; type > 0; type--) {
if (subtree_type_down[type].size() > 0) {
ss << subtree_type_down[type].size() << " " << osdmap.crush->get_type_name(type);
if (subtree_type_down[type].size() > 1) {
ss << "s";
}
int sum_down_osds = 0;
for (auto j = subtree_type_down[type].begin(); j != subtree_type_down[type].end(); ++j) {
sum_down_osds = sum_down_osds + num_osds_subtree[*j];
}
ss << " (" << sum_down_osds << " osds) down\n";
}
}
ss << down_in_osds.size() << " osds are down\n";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (num_down_in_subtrees > 0) {
ostringstream sst;
if (num_in_subtrees == 1) {
sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
" is down";
sst << "(" << down_in_subtree_names << ")";
summary.push_back(make_pair(HEALTH_WARN, sst.str()));
} else {
sst << num_down_in_subtrees << "/" << num_in_subtrees << " " << g_conf->mon_osd_down_out_subtree_limit <<
"s are down";
sst << "(" << down_in_subtree_names << ")";
summary.push_back(make_pair(HEALTH_WARN, sst.str()));

if (detail) {
ostringstream ss;
// details of down subtree types
for (int type = max_type; type > 0; type--) {
for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) {
ss << osdmap.crush->get_type_name(type);
ss << " ";
ss << osdmap.crush->get_item_name(*j);
// at the top level, do not print location
if (type != max_type) {
ss << " (";
ss << osdmap.crush->get_full_location_ordered_string(*j);
ss << ")";
}
int num = num_osds_subtree[*j];
ss << " (" << num << " osds)";
ss << " is down\n";
}
}
// details of down osds
for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
ss << "osd." << *it << " (";
ss << osdmap.crush->get_full_location_ordered_string(*it);
ss << ") is down\n";
}
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}

Expand Down
42 changes: 23 additions & 19 deletions src/osd/OSDMap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -293,32 +293,36 @@ bool OSDMap::containing_subtree_is_down(CephContext *cct, int id, int subtree_ty
}
}

int OSDMap::get_parent_subtree_id(CephContext *cct, int id, int subtree_type, set<int> *down_cache) const
bool OSDMap::subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const
{
set<int> local_down_cache;
if (!down_cache) {
down_cache = &local_down_cache;
}

int current = id;
while (true) {
int type;
if (current >= 0) {
type = 0;
if (id >= 0) {
bool is_down_ret = is_down(id);
if (is_down_ret) {
down_in_osds->insert(id);
} else {
type = crush->get_bucket_type(current);
up_in_osds->insert(id);
}
assert(type >= 0);
return is_down_ret;
}

if (type >= subtree_type) {
return current;
}
if (subtree_type_down &&
(*subtree_type_down)[subtree_type].count(id)) {
return true;
}

int r = crush->get_immediate_parent_id(current, &current);
if (r < 0) {
return -ENOENT;
list<int> children;
crush->get_children(id, &children);
for (const auto &child : children) {
if (!subtree_type_is_down(cct, child, crush->get_bucket_type(child), down_in_osds, up_in_osds, subtree_up, subtree_type_down)) {
subtree_up->insert(id);
return false;
}
}
if (subtree_type_down) {
(*subtree_type_down)[subtree_type].insert(id);
}
return true;
}

void OSDMap::Incremental::encode_client_old(bufferlist& bl) const
Expand Down
7 changes: 3 additions & 4 deletions src/osd/OSDMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,9 @@ class OSDMap {
bool subtree_is_down(int id, set<int> *down_cache) const;
bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;

/**
* get the id of the parent subtree
*/
int get_parent_subtree_id(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;
/**
 * Recursively decide whether the item `id` is down, recording what is
 * learned along the way in the caller-supplied caches.
 *
 * - id >= 0: the result of is_down(id) is returned, and id is inserted
 *   into *down_in_osds or *up_in_osds accordingly.
 * - id < 0: returns true at once if (*subtree_type_down)[subtree_type]
 *   already contains id. Otherwise every child reported by
 *   crush->get_children() is checked recursively, using the child's own
 *   bucket type. The first child found not to be down causes id to be
 *   inserted into *subtree_up and false to be returned. If no child is up
 *   (including the case of no children at all), id is inserted into
 *   (*subtree_type_down)[subtree_type] and true is returned.
 *
 * @param cct only forwarded to the recursive calls
 * @param id item to test
 * @param subtree_type key under which id is looked up / recorded in
 *        *subtree_type_down
 * @param down_in_osds set receiving ids >= 0 found down (dereferenced
 *        without a null check)
 * @param up_in_osds set receiving ids >= 0 found not down (dereferenced
 *        without a null check)
 * @param subtree_up set receiving ids < 0 that have a child which is not
 *        down (dereferenced without a null check)
 * @param subtree_type_down per-type sets of ids < 0 found down; may be
 *        null, in which case nothing is looked up or recorded there
 * @return true if the item is considered down, false otherwise
 */
bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set<int> *down_in_osds, set<int> *up_in_osds,
set<int> *subtree_up, unordered_map<int, set<int> > *subtree_type_down) const;

int identify_osd(const entity_addr_t& addr) const;
int identify_osd(const uuid_d& u) const;
int identify_osd_on_all_channels(const entity_addr_t& addr) const;
Expand Down

0 comments on commit 72682e5

Please sign in to comment.