Skip to content

Commit

Permalink
Periodically capture resource metrics for every sandbox (#216)
Browse files Browse the repository at this point in the history
  • Loading branch information
0div authored Jan 24, 2025
2 parents 9fea81c + 8c1fbbd commit f823030
Show file tree
Hide file tree
Showing 13 changed files with 405 additions and 94 deletions.
30 changes: 30 additions & 0 deletions packages/api/internal/api/api.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

106 changes: 54 additions & 52 deletions packages/api/internal/api/spec.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions packages/api/internal/api/types.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

103 changes: 103 additions & 0 deletions packages/api/internal/handlers/sandbox_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package handlers

import (
"encoding/json"
"fmt"
"net/http"
"slices"
"strings"
"time"

"github.com/gin-gonic/gin"
"github.com/grafana/loki/pkg/loghttp"
"github.com/grafana/loki/pkg/logproto"
"go.opentelemetry.io/otel/attribute"

"github.com/e2b-dev/infra/packages/api/internal/api"
"github.com/e2b-dev/infra/packages/api/internal/auth"
authcache "github.com/e2b-dev/infra/packages/api/internal/cache/auth"
"github.com/e2b-dev/infra/packages/api/internal/utils"
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
)

// GetSandboxesSandboxIDMetrics responds with the resource metrics recorded for
// a sandbox.
//
// It queries Loki for the envd log lines in the "metrics" category that belong
// to the given sandbox and to the team stored in the request context, over the
// window [now-oldestLogsLimit, now]. Each log line is decoded from JSON into an
// api.SandboxMetric; lines that fail to decode are reported and skipped. The
// metrics are written as a JSON array sorted by timestamp with status 200.
//
// On a Loki query failure the handler responds with 404; on an unexpected
// result kind it responds with 500.
func (a *APIStore) GetSandboxesSandboxIDMetrics(
	c *gin.Context,
	sandboxID string,
) {
	ctx := c.Request.Context()
	sandboxID = utils.ShortID(sandboxID)

	// NOTE(review): unchecked assertion — presumably the auth middleware always
	// stores the team info under auth.TeamContextKey before this handler runs.
	teamID := c.Value(auth.TeamContextKey).(authcache.AuthTeamInfo).Team.ID

	telemetry.SetAttributes(ctx,
		attribute.String("instance.id", sandboxID),
		attribute.String("team.id", teamID.String()),
	)

	end := time.Now()
	start := end.Add(-oldestLogsLimit)

	// Sanitize ID: the label values below are backtick-quoted, so a backtick in
	// the ID would terminate the literal and alter the query.
	// https://grafana.com/blog/2021/01/05/how-to-escape-special-characters-with-lokis-logql/
	id := strings.ReplaceAll(sandboxID, "`", "")

	// equivalent CLI query:
	// logcli query '{source="logs-collector", service="envd", teamID="65d165ab-69f6-4b5c-9165-6b93cd341503", sandboxID="izuhqjlfabd8ataeixrtl", category="metrics"}' --from="2025-01-19T10:00:00Z"
	query := fmt.Sprintf(
		"{source=\"logs-collector\", service=\"envd\", teamID=`%s`, sandboxID=`%s`, category=\"metrics\"}", teamID.String(), id)

	res, err := a.lokiClient.QueryRange(query, 100, start, end, logproto.FORWARD, time.Duration(0), time.Duration(0), true)
	if err != nil {
		errMsg := fmt.Errorf("error when returning metrics for sandbox: %w", err)
		telemetry.ReportCriticalError(ctx, errMsg)
		// NOTE(review): a failed Loki query is surfaced as 404 rather than 5xx;
		// kept as-is for API compatibility — confirm this is intended.
		a.sendAPIStoreError(c, http.StatusNotFound, fmt.Sprintf("Error returning metrics for sandbox '%s'", sandboxID))

		return
	}

	switch res.Data.Result.Type() {
	case loghttp.ResultTypeStream:
		// Use the checked form so a mismatch between the reported result type
		// and the concrete value yields an error response instead of a panic.
		streams, ok := res.Data.Result.(loghttp.Streams)
		if !ok {
			errMsg := fmt.Errorf("unexpected value type %T", res.Data.Result)
			telemetry.ReportCriticalError(ctx, errMsg)
			a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error returning metrics for sandbox '%s'", sandboxID))

			return
		}

		// Non-nil empty slice so that "no metrics" is encoded as [] and not null.
		metrics := make([]api.SandboxMetric, 0)

		for _, stream := range streams {
			for _, entry := range stream.Entries {
				// Declared per entry so fields absent from one line never
				// inherit values decoded from a previous line.
				var metric struct {
					CPUUsedPct  float32 `json:"cpuUsedPct"`
					CPUCount    int32   `json:"cpuCount"`
					MemTotalMiB int64   `json:"memTotalMiB"`
					MemUsedMiB  int64   `json:"memUsedMiB"`
				}

				if err := json.Unmarshal([]byte(entry.Line), &metric); err != nil {
					// A single malformed line must not fail the whole request.
					telemetry.ReportCriticalError(ctx, fmt.Errorf("failed to unmarshal metric: %w", err))

					continue
				}

				metrics = append(metrics, api.SandboxMetric{
					Timestamp:   entry.Timestamp,
					CpuUsedPct:  metric.CPUUsedPct,
					CpuCount:    metric.CPUCount,
					MemTotalMiB: metric.MemTotalMiB,
					MemUsedMiB:  metric.MemUsedMiB,
				})
			}
		}

		// Sort metrics by timestamp (Loki returns them in the order they arrived).
		// The closure parameters are named x and y to avoid shadowing receiver a.
		slices.SortFunc(metrics, func(x, y api.SandboxMetric) int {
			return x.Timestamp.Compare(y.Timestamp)
		})

		c.JSON(http.StatusOK, metrics)

	default:
		// %T on the result value itself names the concrete type that was
		// received, which is what the message is meant to convey.
		errMsg := fmt.Errorf("unexpected value type %T", res.Data.Result)
		telemetry.ReportCriticalError(ctx, errMsg)
		a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error returning metrics for sandbox '%s'", sandboxID))

		return
	}
}
34 changes: 23 additions & 11 deletions packages/envd/internal/host/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@ import (
)

type Metrics struct {
CPU float64 `json:"cpu_pct"` // Percent rounded to 2 decimal places
Mem uint64 `json:"mem_bytes"` // Total virtual memory in bytes
Timestamp int64 `json:"ts"` // Unix Timestamp in UTC
Timestamp int64 `json:"ts"` // Unix Timestamp in UTC
CPUCount uint32 `json:"cpu_count"` // Total CPU cores
CPUUsedPercent float32 `json:"cpu_used_pct"` // Percent rounded to 2 decimal places
MemTotalMiB uint64 `json:"mem_total_mib"` // Total virtual memory in MiB
MemUsedMiB uint64 `json:"mem_used_mib"` // Used virtual memory in MiB
}

func GetMetrics() (*Metrics, error) {
Expand All @@ -20,20 +22,30 @@ func GetMetrics() (*Metrics, error) {
return nil, err
}

cpuPcts, err := cpu.Percent(0, false)
memUsedMiB := v.Used / 1024 / 1024
memTotalMiB := v.Total / 1024 / 1024

cpuTotal, err := cpu.Counts(true)
if err != nil {
return nil, err
}

cpuUsedPcts, err := cpu.Percent(0, false)
if err != nil {
return nil, err
}

cpuPct := cpuPcts[0]
cpuPctRounded := cpuPct
if cpuPct > 0 {
cpuPctRounded = math.Round(cpuPct*100) / 100
cpuUsedPct := cpuUsedPcts[0]
cpuUsedPctRounded := float32(cpuUsedPct)
if cpuUsedPct > 0 {
cpuUsedPctRounded = float32(math.Round(cpuUsedPct*100) / 100)
}

return &Metrics{
CPU: cpuPctRounded,
Mem: v.Total,
Timestamp: time.Now().UTC().Unix(),
Timestamp: time.Now().UTC().Unix(),
CPUCount: uint32(cpuTotal),
CPUUsedPercent: cpuUsedPctRounded,
MemUsedMiB: memUsedMiB,
MemTotalMiB: memTotalMiB,
}, nil
}
2 changes: 1 addition & 1 deletion packages/envd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ const (

var (
// These vars are automatically set by goreleaser.
Version = "0.1.4"
Version = "0.1.5"

debug bool
port int64
Expand Down
8 changes: 4 additions & 4 deletions packages/envd/spec/envd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ paths:
"200":
description: The resource usage metrics of the service
content:
application/json:
schema:
$ref: "#/components/schemas/Metrics"
application/json:
schema:
$ref: "#/components/schemas/Metrics"

/init:
post:
Expand Down Expand Up @@ -207,7 +207,7 @@ components:
type: object
description: Resource usage metrics
properties:
cpu_pct:
cpu_used_pct:
type: number
format: float
description: CPU usage percentage
Expand Down
3 changes: 2 additions & 1 deletion packages/nomad/logs-collector.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ job "logs-collector" {
resources {
memory_max = 4096
memory = 512
cpu = 256
cpu = 512
}

template {
Expand Down Expand Up @@ -152,6 +152,7 @@ service = "{{ service }}"
teamID = "{{ teamID }}"
envID = "{{ envID }}"
sandboxID = "{{ sandboxID }}"
category = "{{ category }}"
%{ if var.grafana_logs_endpoint != " " }
[sinks.grafana]
Expand Down
Loading

0 comments on commit f823030

Please sign in to comment.