Skip to content

Commit

Permalink
Merge pull request ceph#22010 from b-ranto/wip-expose-avgcount
Browse files Browse the repository at this point in the history
Expose avgcount to the python modules

Reviewed-by: Jan Fajerski <[email protected]>
Reviewed-by: John Spray <[email protected]>
Reviewed-by: Kefu Chai <[email protected]>
  • Loading branch information
b-ranto authored May 30, 2018
2 parents a67681b + 7f468f4 commit 9d317e5
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 19 deletions.
6 changes: 6 additions & 0 deletions doc/mgr/prometheus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ The *cluster* statistics (i.e. those global to the Ceph cluster)
have labels appropriate to what they report on. For example,
metrics relating to pools have a ``pool_id`` label.


The long-running averages that represent the histograms from core Ceph
are exposed as a pair of ``<name>_sum`` and ``<name>_count`` metrics.
This is similar to how histograms are represented in `Prometheus <https://prometheus.io/docs/concepts/metric_types/#histogram>`_,
and they can be treated `in the same way <https://prometheus.io/docs/practices/histograms/>`_.

Pool and OSD metadata series
----------------------------

Expand Down
25 changes: 18 additions & 7 deletions src/mgr/ActivePyModules.cc
Original file line number Diff line number Diff line change
Expand Up @@ -589,13 +589,24 @@ PyObject* ActivePyModules::get_counter_python(
Mutex::Locker l2(metadata->lock);
if (metadata->perf_counters.instances.count(path)) {
auto counter_instance = metadata->perf_counters.instances.at(path);
const auto &data = counter_instance.get_data();
for (const auto &datapoint : data) {
f.open_array_section("datapoint");
f.dump_unsigned("t", datapoint.t.sec());
f.dump_unsigned("v", datapoint.v);
f.close_section();

// LONGRUNAVG counters are accumulated via push_avg() into the avg
// buffer (see DaemonPerfCounters::update), so they must be read back
// with get_data_avg() and exposed as (sum, count) pairs; all other
// counters live in the plain datapoint buffer.  The original code had
// these two branches swapped, dumping the wrong buffer for each type.
auto counter_type = metadata->perf_counters.types.at(path);
if (counter_type.type & PERFCOUNTER_LONGRUNAVG) {
  const auto &avg_data = counter_instance.get_data_avg();
  for (const auto &datapoint : avg_data) {
    f.open_array_section("datapoint");
    f.dump_unsigned("t", datapoint.t.sec());
    f.dump_unsigned("s", datapoint.s);
    f.dump_unsigned("c", datapoint.c);
    f.close_section();
  }
} else {
  const auto &data = counter_instance.get_data();
  for (const auto &datapoint : data) {
    f.open_array_section("datapoint");
    f.dump_unsigned("t", datapoint.t.sec());
    f.dump_unsigned("v", datapoint.v);
    f.close_section();
  }
}
} else {
dout(4) << "Missing counter: '" << path << "' ("
Expand Down
12 changes: 10 additions & 2 deletions src/mgr/DaemonState.cc
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ void DaemonPerfCounters::update(MMgrReport *report)
for (const auto &t : report->declare_types) {
types.insert(std::make_pair(t.path, t));
session->declared_types.insert(t.path);
instances.insert(std::pair<std::string, PerfCounterInstance>(
t.path, PerfCounterInstance(t.type)));
}
// Remove any old types
for (const auto &t : report->undeclare_types) {
Expand All @@ -162,9 +164,10 @@ void DaemonPerfCounters::update(MMgrReport *report)
if (t.type & PERFCOUNTER_LONGRUNAVG) {
decode(avgcount, p);
decode(avgcount2, p);
instances.at(t_path).push_avg(now, val, avgcount);
} else {
instances.at(t_path).push(now, val);
}
// TODO: interface for insertion of avgs
instances[t_path].push(now, val);
}
DECODE_FINISH(p);
}
Expand All @@ -179,3 +182,8 @@ void PerfCounterInstance::push(utime_t t, uint64_t const &v)
buffer.push_back({t, v});
}

// Record one sample of a long-running average counter at time t:
// s is the accumulated sum and c the sample count (avgcount) decoded
// from the daemon's report (see DaemonPerfCounters::update).  Appended
// to the bounded avg_buffer; oldest samples fall off when it is full.
void PerfCounterInstance::push_avg(utime_t t, uint64_t const &s,
uint64_t const &c)
{
avg_buffer.push_back({t, s, c});
}
28 changes: 26 additions & 2 deletions src/mgr/DaemonState.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,41 @@ class PerfCounterInstance
{}
};

// One sample of a long-running average counter: the capture time plus
// the (sum, count) pair supplied to PerfCounterInstance::push_avg().
class AvgDataPoint
{
public:
utime_t t;    // capture time
uint64_t s;   // accumulated sum
uint64_t c;   // sample count (avgcount in the daemon's report)
AvgDataPoint(utime_t t_, uint64_t s_, uint64_t c_)
: t(t_), s(s_), c(c_)
{}
};

boost::circular_buffer<DataPoint> buffer;
boost::circular_buffer<AvgDataPoint> avg_buffer;

uint64_t get_current() const;

public:
const boost::circular_buffer<DataPoint> & get_data() const
{
return buffer;
}
const boost::circular_buffer<AvgDataPoint> & get_data_avg() const
{
return avg_buffer;
}
void push(utime_t t, uint64_t const &v);
PerfCounterInstance()
: buffer(20) {}
void push_avg(utime_t t, uint64_t const &s, uint64_t const &c);

PerfCounterInstance(enum perfcounter_type_d type)
{
if (type & PERFCOUNTER_LONGRUNAVG)
avg_buffer = boost::circular_buffer<AvgDataPoint>(20);
else
buffer = boost::circular_buffer<DataPoint>(20);
};
};


Expand Down
28 changes: 25 additions & 3 deletions src/pybind/mgr/mgr_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,13 @@ def get_latest(self, daemon_type, daemon_name, counter):
else:
return 0

def get_latest_avg(self, daemon_type, daemon_name, counter):
    """Return the (sum, count) pair from the most recent datapoint of a
    long-running average counter, or (0, 0) if no data is available.

    Each datapoint is a (time, sum, count) tuple as produced by the mgr
    counter dump; only the sum and count of the newest one are returned.
    """
    samples = self.get_counter(daemon_type, daemon_name, counter)[counter]
    if not samples:
        return (0, 0)
    latest = samples[-1]
    return (latest[1], latest[2])

def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
"""
Return the perf counters currently known to this ceph-mgr
Expand Down Expand Up @@ -733,9 +740,24 @@ def get_all_perf_counters(self, prio_limit=PRIO_USEFUL):
if counter_schema['priority'] < prio_limit:
continue

counter_info = counter_schema
counter_info['value'] = self.get_latest(service['type'], service['id'],
counter_path)
counter_info = dict(counter_schema)

# Also populate count for the long running avgs
if counter_schema['type'] & self.PERFCOUNTER_LONGRUNAVG:
v, c = self.get_latest_avg(
service['type'],
service['id'],
counter_path
)
counter_info['value'], counter_info['count'] = v, c
result[svc_full_name][counter_path] = counter_info
else:
counter_info['value'] = self.get_latest(
service['type'],
service['id'],
counter_path
)

result[svc_full_name][counter_path] = counter_info

self.log.debug("returning {0} counter".format(len(result)))
Expand Down
31 changes: 26 additions & 5 deletions src/pybind/mgr/prometheus/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,22 +571,43 @@ def collect(self):

for daemon, counters in self.get_all_perf_counters().items():
for path, counter_info in counters.items():
# Skip histograms, they are represented by long running avgs
stattype = self._stattype_to_str(counter_info['type'])
# XXX simplify first effort: no histograms
# averages are already collapsed to one value for us
if not stattype or stattype == 'histogram':
self.log.debug('ignoring %s, type %s' % (path, stattype))
continue

self.metrics.add_metric(path, Metric(
# Get the value of the counter
value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])

# Represent the long running avgs as sum/count pairs
if counter_info['type'] & self.PERFCOUNTER_LONGRUNAVG:
_path = path + '_sum'
self.metrics.add_metric(_path, Metric(
stattype,
_path,
counter_info['description'] + ' Total',
("ceph_daemon",),
))
self.metrics.append(_path, value, (daemon,))

_path = path + '_count'
self.metrics.add_metric(_path, Metric(
'counter',
_path,
counter_info['description'] + ' Count',
("ceph_daemon",),
))
self.metrics.append(_path, counter_info['count'], (daemon,))
else:
self.metrics.add_metric(path, Metric(
stattype,
path,
counter_info['description'],
("ceph_daemon",),
))
self.metrics.append(path, value, (daemon,))

value = self._perfvalue_to_value(counter_info['type'], counter_info['value'])
self.metrics.append(path, value, (daemon,))
# It is sufficient to reset the pending metrics once per scrape
self.metrics.reset()

Expand Down

0 comments on commit 9d317e5

Please sign in to comment.