Skip to content

Commit

Permalink
Send SIGTERM to zeus children before SIGKILL
Browse files Browse the repository at this point in the history
Zeus currently terminates processes by sending SIGKILL, which doesn't
allow them to clean up after themselves in any way. It would be nice
to avoid SIGKILL entirely and force children to behave, but to preserve
backwards compatibility we send SIGTERM first, followed by SIGKILL if
the process has not exited within the timeout.
  • Loading branch information
andrew-stripe committed Aug 18, 2016
1 parent 7e76a30 commit 73a8e63
Showing 1 changed file with 48 additions and 6 deletions.
54 changes: 48 additions & 6 deletions go/processtree/slavenode.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strings"
"sync"
"syscall"
"time"

"fmt"
"runtime"
Expand All @@ -19,6 +20,10 @@ import (
"github.com/burke/zeus/go/unixsocket"
)

// forceKillTimeout is how long forceKillPid waits after sending SIGTERM
// before it escalates to SIGKILL.
const forceKillTimeout = time.Second

type SlaveNode struct {
ProcessTreeNode
socket *unixsocket.Usock
Expand Down Expand Up @@ -113,9 +118,7 @@ func (s *SlaveNode) SlaveWasInitialized(pid, parentPid int, usock *unixsocket.Us

s.L.Lock()
if !s.ReportBootEvent() {
if pid > 0 {
syscall.Kill(pid, syscall.SIGKILL)
}
s.forceKillPid(pid)
slog.ErrorString(fmt.Sprintf("Unexpected process %d with parent %d for slave %q was killed", pid, parentPid, s.Name))
} else {
s.wipe()
Expand Down Expand Up @@ -354,9 +357,7 @@ func (s *SlaveNode) bootCommand(request *CommandRequest) {

// ForceKill terminates the process whose pid is stored in s.pid. A
// non-positive pid means there is nothing to kill.
//
// NOTE(review): the direct SIGKILL below looks like the pre-change side of
// this diff hunk. If it really runs before forceKillPid, the process is
// killed outright and the SIGTERM grace period never applies — confirm that
// only the forceKillPid call remains in the committed file.
func (s *SlaveNode) ForceKill() {
// note that we don't try to lock the mutex, so s.pid is read without
// synchronization here (presumably the mutex is s.L — confirm).
if s.pid > 0 {
syscall.Kill(s.pid, syscall.SIGKILL)
}
s.forceKillPid(s.pid)
}

func (s *SlaveNode) wipe() {
Expand Down Expand Up @@ -416,6 +417,47 @@ func (s *SlaveNode) handleMessages(featurePipe *os.File) {
}
}

// forceKillPid asks the process identified by pid to exit and escalates to
// SIGKILL if it is still present after forceKillTimeout.
//
// It sends SIGTERM first so the process gets a chance to clean up, then
// probes it with signal 0 until the probe fails or the timeout elapses.
// A pid <= 0 is a no-op.
//
// It returns nil when the process is gone (the probe reports ESRCH) or when
// SIGKILL was delivered. It returns a non-nil error when SIGTERM cannot be
// sent, when the probe fails for a reason other than ESRCH, or when SIGKILL
// fails for a reason other than ESRCH. The call blocks for up to
// forceKillTimeout.
func (s *SlaveNode) forceKillPid(pid int) error {
	if pid <= 0 {
		return nil
	}

	if err := syscall.Kill(pid, syscall.SIGTERM); err != nil {
		err = fmt.Errorf("Error killing pid %d: %v", pid, err)
		s.trace("%v", err)
		return err
	}

	// done tells the poller to stop once this function has returned, so it
	// cannot outlive the call (signal 0 keeps succeeding for a process that
	// has exited but has not been reaped by its parent).
	done := make(chan struct{})
	defer close(done)

	// Buffered so the poller can always hand over its result and exit, even
	// if the timeout branch below has already been taken.
	exited := make(chan error, 1)
	go func() {
		ticker := time.NewTicker(time.Millisecond)
		defer ticker.Stop()
		for {
			// Since the process is not our direct child, we can't use wait
			// and are forced to poll for completion with signal 0.
			if err := syscall.Kill(pid, syscall.Signal(0)); err != nil {
				exited <- err
				return
			}

			select {
			case <-done:
				return
			case <-ticker.C:
			}
		}
	}()

	select {
	case err := <-exited:
		// ESRCH means the pid no longer exists, which is the outcome we want.
		if err != syscall.ESRCH {
			err = fmt.Errorf("Error sending signal to pid %d: %v", pid, err)
			s.trace("%v", err)
			return err
		}
		return nil
	case <-time.After(forceKillTimeout):
		// The process ignored SIGTERM for the whole grace period.
		if err := syscall.Kill(pid, syscall.SIGKILL); err != nil && err != syscall.ESRCH {
			err = fmt.Errorf("Error killing pid %d: %v", pid, err)
			s.trace("%v", err)
			return err
		}
		return nil
	}
}

func (s *SlaveNode) trace(format string, args ...interface{}) {
if !slog.TraceEnabled() {
return
Expand Down

0 comments on commit 73a8e63

Please sign in to comment.