Skip to content
This repository has been archived by the owner on Aug 19, 2023. It is now read-only.

Commit

Permalink
feat(networkd): Make healthcheck perform a check
Browse files Browse the repository at this point in the history
This implements an actual health check for networkd. We use the ARP table (`ip neighbor`)
to determine if the machine is actively sending traffic. We should see at least one entry
in a REACHABLE, STALE, or DELAY state during normal operating conditions.

Signed-off-by: Brad Beam <[email protected]>
  • Loading branch information
bradbeam authored and andrewrynhard committed Feb 3, 2020
1 parent effd0ee commit e911353
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 4 deletions.
43 changes: 41 additions & 2 deletions internal/app/networkd/pkg/reg/reg.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.

// Package reg provides the gRPC network service implementation.
package reg

import (
Expand Down Expand Up @@ -155,16 +156,54 @@ func toCIDR(family uint8, prefix net.IP, prefixLen int) string {
}

// Check implements the Health API and provides visibility into the state of networkd.
//
// Health is derived from the kernel neighbor (ARP) table. The reported status
// progresses as follows:
//
//   - UNKNOWN: initial state; also what is returned (together with the error)
//     when the neighbor table cannot be listed.
//   - NOT_SERVING: the neighbor table was listed, but no qualifying entry was found.
//   - SERVING: at least one neighbor on an interface present in
//     r.Networkd.Interfaces is in the REACHABLE, STALE, or DELAY state.
//
// Neighbors whose link cannot be looked up, or whose link is not one of the
// interfaces tracked by networkd (for example CNI links), are ignored.
// The ctx and in arguments are not used by this implementation.
func (r *Registrator) Check(ctx context.Context, in *empty.Empty) (reply *healthapi.HealthCheckResponse, err error) {
	// Start from UNKNOWN: nothing has been verified yet.
	reply = &healthapi.HealthCheckResponse{
		Messages: []*healthapi.HealthCheck{
			{
				Status: healthapi.HealthCheck_UNKNOWN,
			},
		},
	}

	var neighbors []rtnetlink.NeighMessage

	neighbors, err = r.Conn.Neigh.List()
	if err != nil {
		// The reply is returned alongside the error so callers that inspect
		// it still see the UNKNOWN status.
		return reply, err
	}

	// The neighbor table is readable, so the state is now known; assume
	// NOT_SERVING until a qualifying neighbor proves otherwise.
	reply.Messages[0].Status = healthapi.HealthCheck_NOT_SERVING

	// Find at least one neighbor in an acceptable state.
	for _, neighbor := range neighbors {
		// Only consider neighbors associated with a link managed by
		// networkd (skips CNI links). A distinct error variable is used so
		// the named result err is not shadowed; lookup failures are
		// deliberately best-effort and just skip the entry.
		link, linkErr := r.Conn.Link.Get(neighbor.Index)
		if linkErr != nil {
			continue
		}

		if _, ok := r.Networkd.Interfaces[link.Attributes.Name]; !ok {
			continue
		}

		// Any one of these states is enough to report SERVING.
		switch neighbor.State {
		case unix.NUD_REACHABLE, unix.NUD_STALE, unix.NUD_DELAY:
			reply.Messages[0].Status = healthapi.HealthCheck_SERVING
			return reply, nil
		}
	}

	return reply, nil
}

Expand Down
6 changes: 4 additions & 2 deletions internal/app/networkd/pkg/reg/reg_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ func (suite *NetworkdSuite) TestHealthAPI() {
nClient := healthapi.NewHealthClient(conn)
hcResp, err := nClient.Check(context.Background(), &empty.Empty{})
suite.Assert().NoError(err)
suite.Assert().Equal(healthapi.HealthCheck_SERVING, hcResp.Messages[0].Status)
// We can only assert that the status is not unknown, since it's not guaranteed
// that the host the tests run on will have a populated ARP table.
suite.Assert().NotEqual(healthapi.HealthCheck_UNKNOWN, hcResp.Messages[0].Status)

rResp, err := nClient.Ready(context.Background(), &empty.Empty{})
suite.Assert().NoError(err)
Expand All @@ -139,7 +141,7 @@ func (suite *NetworkdSuite) TestHealthAPI() {
for i := 0; i < 2; i++ {
hcResp, err = stream.Recv()
suite.Assert().NoError(err)
suite.Assert().Equal(healthapi.HealthCheck_SERVING, hcResp.Messages[0].Status)
suite.Assert().NotEqual(healthapi.HealthCheck_UNKNOWN, hcResp.Messages[0].Status)
}

cancel()
Expand Down

0 comments on commit e911353

Please sign in to comment.