Skip to content

Commit

Permalink
Merge pull request ceph#22010 from b-ranto/wip-expose-avgcount
Browse files Browse the repository at this point in the history
Expose avgcount to the python modules

Reviewed-by: Jan Fajerski <[email protected]>
Reviewed-by: John Spray <[email protected]>
Reviewed-by: Kefu Chai <[email protected]>
  • Loading branch information
b-ranto authored May 30, 2018
2 parents a67681b + 7f468f4 commit 9d317e5
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 19 deletions.
6 changes: 6 additions & 0 deletions doc/mgr/prometheus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ The *cluster* statistics (i.e. those global to the Ceph cluster)
have labels appropriate to what they report on. For example,
metrics relating to pools have a ``pool_id`` label.


The long-running averages that represent the histograms from core Ceph
are exposed as a pair of ``<name>_sum`` and ``<name>_count`` metrics.
This is similar to how histograms are represented in `Prometheus <https://prometheus.io/docs/concepts/metric_types/#histogram>`_,
and they can be treated `in the same way <https://prometheus.io/docs/practices/histograms/>`_.

Pool and OSD metadata series
----------------------------

Expand Down
25 changes: 18 additions & 7 deletions src/mgr/ActivePyModules.cc
Original file line number Diff line number Diff line change
Expand Up @@ -589,13 +589,24 @@ PyObject* ActivePyModules::get_counter_python(
Mutex::Locker l2(metadata->lock);
if (metadata->perf_counters.instances.count(path)) {
auto counter_instance = metadata->perf_counters.instances.at(path);
const auto &data = counter_instance.get_data();
for (const auto &datapoint : data) {
f.open_array_section("datapoint");
f.dump_unsigned("t", datapoint.t.sec());
f.dump_unsigned("v", datapoint.v);
f.close_section();

// LONGRUNAVG counters are accumulated via push_avg() into the avg
// buffer (see DaemonPerfCounters::update), so they must be read back
// with get_data_avg() and exposed as (sum, count) pairs; all other
// counters live in the plain datapoint buffer.  The original code had
// these two branches swapped, dumping the wrong buffer for each type.
auto counter_type = metadata->perf_counters.types.at(path);
if (counter_type.type & PERFCOUNTER_LONGRUNAVG) {
  const auto &avg_data = counter_instance.get_data_avg();
  for (const auto &datapoint : avg_data) {
    f.open_array_section("datapoint");
    f.dump_unsigned("t", datapoint.t.sec());
    f.dump_unsigned("s", datapoint.s);
    f.dump_unsigned("c", datapoint.c);
    f.close_section();
  }
} else {
  const auto &data = counter_instance.get_data();
  for (const auto &datapoint : data) {
    f.open_array_section("datapoint");
    f.dump_unsigned("t", datapoint.t.sec());
    f.dump_unsigned("v", datapoint.v);
    f.close_section();
  }
}
} else {
dout(4) << "Missing counter: '" << path << "' ("
Expand Down
12 changes: 10 additions & 2 deletions src/mgr/DaemonState.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ void DaemonPerfCounters::update(MMgrReport *report)
for (const auto &t : report->declare_types) {
types.insert(std::make_pair(t.path, t));
session->declared_types.insert(t.path);
instances.insert(std::pair<std::string, PerfCounterInstance>(
t.path, PerfCounterInstance(t.type)));
}
// Remove any old types
for (const auto &t : report->undeclare_types) {
Expand All @@ -162,9 +164,10 @@ void DaemonPerfCounters::update(MMgrReport *report)
if (t.type & PERFCOUNTER_LONGRUNAVG) {
decode(avgcount, p);
decode(avgcount2, p);
instances.at(t_path).push_avg(now, val, avgcount);
} else {
instances.at(t_path).push(now, val);
}
// TODO: interface for insertion of avgs
instances[t_path].push(now, val);
}
DECODE_FINISH(p);
}
Expand All @@ -179,3 +182,8 @@ void PerfCounterInstance::push(utime_t t, uint64_t const &v)
buffer.push_back({t, v});
}

// Record one sample of a long-running average counter at time t:
// s is the accumulated sum and c the sample count (avgcount) decoded
// from the daemon's report (see DaemonPerfCounters::update).  Appended
// to the bounded avg_buffer; oldest samples fall off when it is full.
void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s,
uint64_t const &c)
{
avg_buffer.push_back({t, s, c});
}
28 changes: 26 additions & 2 deletions src/mgr/DaemonState.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,41 @@ class PerfCounterInstance
{}
};

// One sample of a long-running average counter: the capture time plus
// the (sum, count) pair supplied to PerfCounterInstance::push_avg().
class AvgDataPoint
{
public:
utime_t t;    // capture time
uint64_t s;   // accumulated sum
uint64_t c;   // sample count (avgcount in the daemon's report)
AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_)
: t(t_), s(s_), c(c_)
{}
};

boost::circular_buffer<DataPoint> buffer;
boost::circular_buffer<AvgDataPoint> avg_buffer;

uint64_t get_current() const;

public:
const boost::circular_buffer<DataPoint> & get_data() const
{
return buffer;
}
const boost::circular_buffer<AvgDataPoint> & get_data_avg() const
{
return avg_buffer;
}
void push(utime_t t, uint64_t const &v);
PerfCounterInstance()
: buffer(20) {}
void push_avg(utime_t t, uint64_t const &s, uint64_t const &c);

PerfCounterInstance(enum perfcounter_type_d type)
{
if (type & PERFCOUNTER_LONGRUNAVG)
avg_buffer = boost::circular_buffer<AvgDataPoint>(20);
else
buffer = boost::circular_buffer<DataPoint>(20);
};
};


Expand Down
28 changes: 25 additions & 3 deletions src/pybind/mgr/mgr_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,13 @@ def get_latest(self, daemon_type, daemon_name, counter):
else:
return 0

def get_latest_avg(self, daemon_type, daemon_name, counter):
    """Return the (sum, count) pair from the most recent datapoint of a
    long-running average counter, or (0, 0) if no data is available.

    Each datapoint is a (time, sum, count) tuple as produced by the mgr
    counter dump; only the sum and count of the newest one are returned.
    """
    samples = self.get_counter(daemon_type, daemon_name, counter)[counter]
    if not samples:
        return (0, 0)
    latest = samples[-1]
    return (latest[1], latest[2])

def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
"""
Return the perf counters currently known to this ceph-mgr
Expand Down Expand Up @@ -733,9 +740,24 @@ def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
if counter_schema['priority'] < prio_limit:
continue

counter_info = counter_schema
counter_info['value'] = self.get_latest(service['type'], service['id'],
counter_path)
counter_info = dict(counter_schema)

# Also populate count for the long running avgs
if counter_schema['type'] & self.PERFCOUNTER_LONGRUNAVG:
v, c = self.get_latest_avg(
service['type'],
service['id'],
counter_path
)
counter_info['value'], counter_info['count'] = v, c
result[svc_full_name][counter_path] = counter_info
else:
counter_info['value'] = self.get_latest(
service['type'],
service['id'],
counter_path
)

result[svc_full_name][counter_path] = counter_info

self.log.debug("returning {0} counter".format(len(result)))
Expand Down
31 changes: 26 additions & 5 deletions src/pybind/mgr/prometheus/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,22 +571,43 @@ def collect(self):

for daemon, counters in self.get_all_perf_counters().items():
for path, counter_info in counters.items():
# Skip histograms, they are represented by long running avgs
stattype = self._stattype_to_str(counter_info['type'])
# XXX simplify first effort: no histograms
# averages are already collapsed to one value for us
if not stattype or stattype == 'histogram':
self.log.debug('ignoring %s, type %s' % (path, stattype))
continue

self.metrics.add_metric(path, Metric(
# Get the value of the counter
value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])

# Represent the long running avgs as sum/count pairs
if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
_path = path + '_sum'
self.metrics.add_metric(_path, Metric(
stattype,
_path,
counter_info['description'] + ' Total',
("ceph_daemon",),
))
self.metrics.append(_path, value, (daemon,))

_path = path + '_count'
self.metrics.add_metric(_path, Metric(
'counter',
_path,
counter_info['description'] + ' Count',
("ceph_daemon",),
))
self.metrics.append(_path, counter_info['count'], (daemon,))
else:
self.metrics.add_metric(path, Metric(
stattype,
path,
counter_info['description'],
("ceph_daemon",),
))
self.metrics.append(path, value, (daemon,))

value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])
self.metrics.append(path, value, (daemon,))
# It is sufficient to reset the pending metrics once per scrape
self.metrics.reset()

Expand Down

0 comments on commit 9d317e5

Please sign in to comment.