Merge pull request ceph#37902 from pdvian/wip-clog-health-detail

mon: Log "ceph health detail" periodically in cluster log

Reviewed-by: Josh Durgin <[email protected]>
Reviewed-by: Neha Ojha <[email protected]>
zmc · Nov 12, 2020 · 4a6f53a · 4a6f53a
2 parents 6f75704 + f45712c
commit 4a6f53a
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 2 deletions.
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
@@ -34,6 +34,10 @@
 >=15.0.0
 --------
 
+* MON: The cluster log now logs health detail every ``mon_health_to_clog_interval``,
+  which has been changed from 1hr to 10min. Logging of health detail will be
+  skipped if the health summary has not changed since it was last logged.
+
 * The ``ceph df`` command now lists the number of pgs in each pool.
 
 * Monitors now have config option ``mon_allow_pool_size_one``, which is disabled

diff --git a/qa/tasks/ceph.conf.template b/qa/tasks/ceph.conf.template
@@ -40,6 +40,7 @@
 
 	mon cluster log file level = debug
 	debug asserts on shutdown = true
+	mon health detail to clog = false
 
 [osd]
         osd journal size = 100

diff --git a/src/common/legacy_config_opts.h b/src/common/legacy_config_opts.h
@@ -263,6 +263,7 @@ OPTION(mon_reweight_max_change, OPT_DOUBLE)
 OPTION(mon_health_to_clog, OPT_BOOL)
 OPTION(mon_health_to_clog_interval, OPT_INT)
 OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
+OPTION(mon_health_detail_to_clog, OPT_BOOL)
 OPTION(mon_data_avail_crit, OPT_INT)
 OPTION(mon_data_avail_warn, OPT_INT)
 OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)

diff --git a/src/common/options.cc b/src/common/options.cc
@@ -1941,7 +1941,7 @@ std::vector<Option> get_global_options() {
     .set_description("log monitor health to cluster log"),
 
     Option("mon_health_to_clog_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
-    .set_default(1_hr)
+    .set_default(10_min)
     .add_service("mon")
     .set_description("frequency to log monitor health to cluster log")
     .add_see_also("mon_health_to_clog"),
@@ -1951,6 +1951,10 @@ std::vector<Option> get_global_options() {
     .add_service("mon")
     .set_description(""),
 
+    Option("mon_health_detail_to_clog", Option::TYPE_BOOL, Option::LEVEL_DEV)
+    .set_default(true)
+    .set_description("log health detail to cluster log"),
+
     Option("mon_health_max_detail", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
     .set_default(50)
     .add_service("mon")

diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
@@ -2835,7 +2835,16 @@ void Monitor::do_health_to_clog(bool force)
       summary == health_status_cache.summary &&
       level == health_status_cache.overall)
     return;
-  clog->health(level) << "overall " << summary;
+
+  if (g_conf()->mon_health_detail_to_clog &&
+      summary != health_status_cache.summary &&
+      level != HEALTH_OK) {
+    string details;
+    level = healthmon()->get_health_status(true, nullptr, &details);
+    clog->health(level) << "Health detail: " << details;
+  } else {
+    clog->health(level) << "overall " << summary;
+  }
   health_status_cache.summary = summary;
   health_status_cache.overall = level;
 }