Skip to content

Commit

Permalink
kube-updater: Implement UnhealthyWorkloadTrigger (gravitational#22737)
Browse files Browse the repository at this point in the history
This trigger allows a maintenance to start if the teleport-kube-agent is
unhealthy. A workload is unhealthy if at least one of its managed pods
is unhealthy. A pod is unhealthy if it has not been ready for 10 minutes
or more.
  • Loading branch information
hugoShaka authored Mar 16, 2023
1 parent 3f0c74b commit b2d5ea5
Show file tree
Hide file tree
Showing 7 changed files with 509 additions and 7 deletions.
1 change: 1 addition & 0 deletions integrations/kube-agent-updater/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/evanphx/json-patch v4.12.0+incompatible // indirect
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
Expand Down
1 change: 1 addition & 0 deletions integrations/kube-agent-updater/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ github.com/envoyproxy/go-control-plane v0.9.10-0.20210907150352-cf90f659a021/go.
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ=
github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH5pOlLGNtQ5lPWQu84=
github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk=
github.com/evanphx/json-patch/v5 v5.6.0 h1:b91NhWfaz02IuVxO9faSllyAtNXHMPkC5J8sJCLunww=
github.com/evanphx/json-patch/v5 v5.6.0/go.mod h1:G79N1coSVB93tBe7j6PhzjmR3/2VvlbKOFpnXhI9Bw4=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
Expand Down
4 changes: 2 additions & 2 deletions integrations/kube-agent-updater/pkg/controller/updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (

"github.com/docker/distribution/reference"
"github.com/gravitational/trace"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"

"github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/img"
Expand All @@ -42,7 +42,7 @@ type VersionUpdater struct {
// validating the new image signature.
// If all steps are successfully executed and there's a new version, it returns
// a digested reference to the new image that should be deployed.
func (r *VersionUpdater) GetVersion(ctx context.Context, obj v1.Object, currentVersion string) (img.NamedTaggedDigested, error) {
func (r *VersionUpdater) GetVersion(ctx context.Context, obj client.Object, currentVersion string) (img.NamedTaggedDigested, error) {
// Those are debug logs only
log := ctrllog.FromContext(ctx).V(1)

Expand Down
4 changes: 2 additions & 2 deletions integrations/kube-agent-updater/pkg/maintenance/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package maintenance
import (
"context"

v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// TriggerMock is a fake Trigger that returns a static answer. This is used
Expand All @@ -35,7 +35,7 @@ func (m TriggerMock) Name() string {
}

// CanStart returns the statically defined maintenance approval result.
func (m TriggerMock) CanStart(_ context.Context, _ v1.Object) (bool, error) {
func (m TriggerMock) CanStart(_ context.Context, _ client.Object) (bool, error) {
return m.canStart, nil
}

Expand Down
8 changes: 5 additions & 3 deletions integrations/kube-agent-updater/pkg/maintenance/trigger.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package maintenance
import (
"context"

v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
)

Expand All @@ -33,15 +33,17 @@ import (
// of error.
type Trigger interface {
Name() string
CanStart(ctx context.Context, object v1.Object) (bool, error)
CanStart(ctx context.Context, object client.Object) (bool, error)
Default() bool
}

// Triggers is a list of Trigger. Triggers are OR-ed: any trigger firing in the
// list will cause the maintenance to be triggered.
type Triggers []Trigger

func (t Triggers) CanStart(ctx context.Context, object v1.Object) bool {
// CanStart checks if the maintenance can be started. It will return true if at
// least one Trigger approves the maintenance.
func (t Triggers) CanStart(ctx context.Context, object client.Object) bool {
log := ctrllog.FromContext(ctx).V(1)
for _, trigger := range t {
start, err := trigger.CanStart(ctx, object)
Expand Down
158 changes: 158 additions & 0 deletions integrations/kube-agent-updater/pkg/maintenance/unhealthy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
Copyright 2023 Gravitational, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package maintenance

import (
"context"
"time"

"github.com/gravitational/trace"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
kclient "sigs.k8s.io/controller-runtime/pkg/client"
)

const (
	// podReadinessGracePeriod is how long a pod may stay not Ready before
	// isPodUnhealthy reports it as unhealthy.
	podReadinessGracePeriod = 10 * time.Minute

	// Kind names of the workload types handled by this package.
	// NOTE(review): not referenced in this file; presumably used by other
	// files of the package — confirm.
	deploymentKind  = "Deployment"
	statefulSetKind = "StatefulSet"
)

// unhealthyWorkloadTrigger is a maintenance Trigger that approves a
// maintenance when the watched workload is unhealthy. Its purpose is faster
// recovery when a new version breaks the agent: the user is not stuck with a
// broken cluster until the next maintenance window opens.
type unhealthyWorkloadTrigger struct {
	// Client is embedded so the trigger can list the pods managed by the
	// workload (see isWorkloadUnhealthy).
	kclient.Client

	// name is the value reported by Name().
	name string
}

// Name reports the name this trigger was created with.
func (u unhealthyWorkloadTrigger) Name() string {
	return u.name
}

// CanStart implements maintenance.Trigger. It approves the maintenance when
// the pods selected by the given workload are considered unhealthy.
//
// Only *appsv1.Deployment and *appsv1.StatefulSet objects are supported; any
// other type yields a BadParameter error. An error is also returned when the
// workload's label selector cannot be converted or the pods cannot be listed.
func (u unhealthyWorkloadTrigger) CanStart(ctx context.Context, object kclient.Object) (bool, error) {
	// Both supported kinds expose their pod selector the same way, so only
	// the selector extraction depends on the concrete type.
	var podSelector *metav1.LabelSelector
	switch workload := object.(type) {
	case *appsv1.Deployment:
		podSelector = workload.Spec.Selector
	case *appsv1.StatefulSet:
		podSelector = workload.Spec.Selector
	default:
		return false, trace.BadParameter(
			"workload type '%s' not supported",
			object.GetObjectKind().GroupVersionKind().String(),
		)
	}

	selector, err := metav1.LabelSelectorAsSelector(podSelector)
	if err != nil {
		return false, trace.Wrap(err)
	}
	return u.isWorkloadUnhealthy(ctx, object.GetNamespace(), selector)
}

// Default is the answer to use when the trigger cannot be evaluated. It is
// always false, so a failed health evaluation never approves a maintenance.
func (u unhealthyWorkloadTrigger) Default() bool {
	return false
}

// isWorkloadUnhealthy lists the pods matching selector in namespace and
// reports whether the workload owning them is unhealthy. The workload is
// unhealthy when it has no pods at all, or when at least one of its pods is
// unhealthy. Listing errors are wrapped and returned with a false result.
func (u unhealthyWorkloadTrigger) isWorkloadUnhealthy(ctx context.Context, namespace string, selector labels.Selector) (bool, error) {
	pods := &v1.PodList{}
	listErr := u.List(
		ctx,
		pods,
		kclient.InNamespace(namespace),
		kclient.MatchingLabelsSelector{Selector: selector},
	)
	if listErr != nil {
		return false, trace.Wrap(listErr)
	}

	switch {
	case len(pods.Items) == 0:
		// A workload managing no pods is considered unhealthy and can be
		// updated at any time.
		return true, nil
	default:
		// A single unhealthy pod is enough to flag the whole workload.
		return len(UnhealthyPods(pods)) > 0, nil
	}
}

// NewUnhealthyWorkloadTrigger builds a Trigger that approves a maintenance
// when the watched workload is unhealthy. name is the value returned by the
// trigger's Name method and client is used to list the workload's pods.
func NewUnhealthyWorkloadTrigger(name string, client kclient.Client) Trigger {
	trigger := unhealthyWorkloadTrigger{Client: client, name: name}
	return trigger
}

// UnhealthyPods takes a v1.PodList and returns all the pods of the list that
// are unhealthy according to isPodUnhealthy.
//
// The returned pointers reference the elements of list.Items directly (no
// copy is made). A nil list, or a list without unhealthy pods, yields a nil
// slice.
func UnhealthyPods(list *v1.PodList) []*v1.Pod {
	if list == nil {
		return nil
	}

	var unhealthyPods []*v1.Pod
	// Iterate by index and take the address of the slice element. Taking the
	// address of the range variable would, before Go 1.22, make every entry
	// of the result point to the same variable (the last pod iterated).
	for i := range list.Items {
		pod := &list.Items[i]
		if isPodUnhealthy(pod) {
			unhealthyPods = append(unhealthyPods, pod)
		}
	}
	return unhealthyPods
}

// isPodUnhealthy reports whether a pod is unhealthy, i.e. it has not been
// Ready for at least podReadinessGracePeriod.
//
// This heuristic also catches infrastructure issues such as a lack of room to
// schedule the pod. False positives are less problematic than false negatives
// in our case, so this is acceptable. Should false positives become frequent,
// a more specific heuristic could look at the container statuses.
func isPodUnhealthy(pod *v1.Pod) bool {
	// A terminating pod should be gone soon: ignore it by treating it as
	// healthy.
	if pod.DeletionTimestamp != nil {
		return false
	}

	ready := getPodReadyCondition(&pod.Status)
	switch {
	case ready == nil:
		// No Ready condition at all: something is wrong with the pod.
		return true
	case ready.Status == v1.ConditionTrue:
		// The pod is marked as ready, so it is healthy.
		return false
	}

	// The pod is not ready. It only becomes unhealthy once the grace period
	// following its last readiness transition has elapsed.
	gracePeriodEnd := ready.LastTransitionTime.Add(podReadinessGracePeriod)
	return time.Now().After(gracePeriodEnd)
}

// getPodReadyCondition looks up the first condition of type v1.PodReady in
// the pod status. It returns a pointer to a copy of that condition, or nil
// when the status carries no such condition.
func getPodReadyCondition(status *v1.PodStatus) *v1.PodCondition {
	for i := range status.Conditions {
		if status.Conditions[i].Type != v1.PodReady {
			continue
		}
		// Hand back a copy so the caller never aliases the status slice.
		found := status.Conditions[i]
		return &found
	}
	return nil
}
Loading

0 comments on commit b2d5ea5

Please sign in to comment.