Skip to content

Commit

Permalink
Track active migrations in Prometheus and tctl top (gravitational#1…
Browse files Browse the repository at this point in the history
…9520)

This commit adds a new Prometheus gauge, `teleport_migrations`, that
tracks, for each migration, whether it is active (1) or not (0).

This gauge is then leveraged in `tctl top` to show a set of active
migrations.
  • Loading branch information
Vitor Enes authored Dec 22, 2022
1 parent 1b387ab commit 87f706d
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 3 deletions.
3 changes: 2 additions & 1 deletion docs/pages/includes/metrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
| `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys. |
| `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys. |
| `auth_generate_requests` | gauge | Teleport Auth | Number of current generate requests. |
| `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests. |
| `auth_generate_seconds` | histogram | Teleport Auth | Latency for generate requests. |
| `backend_batch_read_requests_total` | counter | cache | Number of read requests to the backend. |
| `backend_batch_read_seconds` | histogram | cache | Latency for batch read operations. |
| `backend_batch_write_requests_total` | counter | cache | Number of batch write requests to the backend. |
Expand Down Expand Up @@ -53,6 +53,7 @@
| `teleport_connected_resources` | gauge | Teleport Auth | Number and type of resources connected via keepalives. |
| `teleport_registered_servers` | gauge | Teleport Auth | The number of Teleport services that are connected to an Auth Service instance grouped by version. |
| `user_login_total` | counter | Teleport Auth | Number of user logins. |
| `teleport_migrations` | gauge | Teleport Auth | Tracks for each migration if it is active (1) or not (0). |
| `watcher_event_sizes` | histogram | cache | Overall size of events emitted. |
| `watcher_events` | histogram | cache | Per resource size of events emitted. |

Expand Down
11 changes: 10 additions & 1 deletion lib/auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,19 @@ var (
[]string{teleport.TagVersion},
)

// migrations is a gauge vector with one series per migration, keyed by the
// teleport.TagMigration label. A series is set to 1 while that migration is
// active and to 0 once it has ended.
migrations = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: teleport.MetricNamespace,
Name: teleport.MetricMigrations,
Help: "Migrations tracks for each migration if it is active (1) or not (0).",
},
[]string{teleport.TagMigration},
)

prometheusCollectors = []prometheus.Collector{
generateRequestsCount, generateThrottledRequestsCount,
generateRequestsCurrent, generateRequestsLatencies, UserLoginCount, heartbeatsMissedByAuth,
registeredAgents,
registeredAgents, migrations,
}
)

Expand Down
20 changes: 20 additions & 0 deletions lib/auth/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,20 @@ func shouldInitReplaceResourceWithOrigin(stored, candidate types.ResourceWithOri
return false, nil
}

// migrationStart records that the named migration has begun: it emits a
// debug log line and sets the migration's gauge to 1 (active).
// Call it at the beginning of a migration; ctx is currently unused.
func migrationStart(ctx context.Context, migrationName string) {
	log.Debugf("Migrations: %q migration started.", migrationName)

	gauge := migrations.WithLabelValues(migrationName)
	gauge.Set(1)
}

// migrationEnd records that the named migration has finished: it emits a
// debug log line and resets the migration's gauge to 0 (inactive).
// Call it when a migration ends; ctx is currently unused.
func migrationEnd(ctx context.Context, migrationName string) {
	log.Debugf("Migrations: %q migration ended.", migrationName)

	gauge := migrations.WithLabelValues(migrationName)
	gauge.Set(0)
}

func migrateLegacyResources(ctx context.Context, asrv *Server) error {
if err := migrateRemoteClusters(ctx, asrv); err != nil {
return trace.Wrap(err)
Expand Down Expand Up @@ -1027,6 +1041,9 @@ func ReadLocalIdentity(dataDir string, id IdentityID) (*Identity, error) {
// where the presence of remote cluster was identified only by presence
// of host certificate authority with cluster name not equal local cluster name
func migrateRemoteClusters(ctx context.Context, asrv *Server) error {
migrationStart(ctx, "remote_clusters")
defer migrationEnd(ctx, "remote_clusters")

clusterName, err := asrv.Services.GetClusterName()
if err != nil {
return trace.Wrap(err)
Expand Down Expand Up @@ -1084,6 +1101,9 @@ func migrateRemoteClusters(ctx context.Context, asrv *Server) error {
//
// DELETE IN 11.0
func migrateDBAuthority(ctx context.Context, asrv *Server) error {
migrationStart(ctx, "db_authority")
defer migrationEnd(ctx, "db_authority")

localClusterName, err := asrv.Services.GetClusterName()
if err != nil {
return trace.Wrap(err)
Expand Down
6 changes: 6 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ const (
// MetricMissingSSHTunnels returns the number of missing SSH tunnels for this proxy.
MetricMissingSSHTunnels = "proxy_missing_ssh_tunnels"

// MetricMigrations is the name (without namespace) of the gauge that
// reports, for each migration, whether it is active or not.
MetricMigrations = "migrations"

// TagMigration is the metric label that carries the migration name.
TagMigration = "migration"

// TagCluster is a metric tag for a cluster
TagCluster = "cluster"
)
Expand Down
26 changes: 25 additions & 1 deletion tool/tctl/common/top_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/gravitational/kingpin"
"github.com/gravitational/roundtrip"
"github.com/gravitational/trace"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"

Expand Down Expand Up @@ -207,6 +208,7 @@ func (c *TopCommand) render(ctx context.Context, re Report, eventID string) erro
{"Cert Gen Requests/sec", humanize.FormatFloat("", re.Cluster.GenerateRequestsCount.GetFreq())},
{"Cert Gen Throttled Requests/sec", humanize.FormatFloat("", re.Cluster.GenerateRequestsThrottledCount.GetFreq())},
{"Auth Watcher Queue Size", humanize.FormatFloat("", re.Cache.QueueSize)},
{"Active Migrations", strings.Join(re.Cluster.ActiveMigrations, ", ")},
}
for _, rc := range re.Cluster.RemoteClusters {
t1.Rows = append(t1.Rows, []string{
Expand Down Expand Up @@ -289,7 +291,7 @@ func (c *TopCommand) render(ctx context.Context, re Report, eventID string) erro
ui.NewRow(0.3, t3),
),
ui.NewCol(0.5,
ui.NewRow(0.3, percentileTable("Generate Server Certificates Histogram", re.Cluster.GenerateRequestsHistogram)),
ui.NewRow(0.3, percentileTable("Generate Server Certificates Percentiles", re.Cluster.GenerateRequestsHistogram)),
),
),
ui.NewRow(0.025,
Expand Down Expand Up @@ -534,6 +536,8 @@ type ClusterStats struct {
GenerateRequestsThrottledCount Counter
// GenerateRequestsHistogram is a histogram of generate requests latencies
GenerateRequestsHistogram Histogram
// ActiveMigrations is a set of active migrations
ActiveMigrations []string
}

// RemoteCluster is a remote cluster (or local cluster)
Expand Down Expand Up @@ -734,6 +738,7 @@ func generateReport(metrics map[string]*dto.MetricFamily, prev *Report, period t
GenerateRequestsCount: Counter{Count: getCounterValue(metrics[teleport.MetricGenerateRequests])},
GenerateRequestsThrottledCount: Counter{Count: getCounterValue(metrics[teleport.MetricGenerateRequestsThrottled])},
GenerateRequestsHistogram: getHistogram(metrics[teleport.MetricGenerateRequestsHistogram], atIndex(0)),
ActiveMigrations: getActiveMigrations(metrics[prometheus.BuildFQName(teleport.MetricNamespace, "", teleport.MetricMigrations)]),
}

if prev != nil {
Expand Down Expand Up @@ -879,6 +884,25 @@ func getRemoteClusters(metric *dto.MetricFamily) []RemoteCluster {
return out
}

// getActiveMigrations extracts the names of the migrations that are currently
// active from the migrations gauge family.
//
// It returns nil when the family is missing, is not a gauge, or has no
// samples. A sample counts as active when its gauge value is non-zero; the
// migration name is taken from the first label named teleport.TagMigration.
// Samples without that label are ignored.
func getActiveMigrations(metric *dto.MetricFamily) []string {
	switch {
	case metric == nil:
		return nil
	case metric.GetType() != dto.MetricType_GAUGE:
		return nil
	case len(metric.Metric) == 0:
		return nil
	}

	var active []string
	for _, sample := range metric.Metric {
		if sample.Gauge.GetValue() != 0 {
			// Only the first matching label contributes a name.
			for _, pair := range sample.Label {
				if pair.GetName() != teleport.TagMigration {
					continue
				}
				active = append(active, pair.GetValue())
				break
			}
		}
	}
	return active
}

func getComponentGaugeValue(component string, metric *dto.MetricFamily) float64 {
if metric == nil || metric.GetType() != dto.MetricType_GAUGE || len(metric.Metric) == 0 || metric.Metric[0].Gauge == nil || metric.Metric[0].Gauge.Value == nil {
return 0
Expand Down

0 comments on commit 87f706d

Please sign in to comment.