Skip to content

Commit

Permalink
feat(clouddriver): wait for instances down when server group is disa…
Browse files Browse the repository at this point in the history
…bled (spinnaker#2124)

* feat(clouddriver): wait for instances down when server group is disabled
  • Loading branch information
dreynaud authored Apr 12, 2018
1 parent 6579f0c commit 462bfaa
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ abstract class AbstractWaitForClusterWideClouddriverTask extends AbstractCloudPr
protected TaskResult emptyClusterResult(Stage stage,
AbstractClusterWideClouddriverTask.ClusterSelection clusterSelection,
Map cluster) {
throw new IllegalStateException("No ServerGroups found in cluster $clusterSelection")
throw new IllegalStateException("no server groups found in cluster $clusterSelection")
}

boolean isServerGroupOperationInProgress(Stage stage,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,35 +66,36 @@ class WaitForClusterDisableTask extends AbstractWaitForClusterWideClouddriverTas
boolean isServerGroupOperationInProgress(Stage stage,
List<Map> interestingHealthProviderNames,
Optional<TargetServerGroup> serverGroup) {
// null vs empty interestingHealthProviderNames do mean very different things to Spinnaker
// a null value will result in Spinnaker waiting for discovery + platform, etc. whereas an empty will not wait for anything.
if (interestingHealthProviderNames != null && interestingHealthProviderNames.isEmpty()) {
return false
}

// Assume a missing server group is disabled.
boolean isDisabled = serverGroup.map({ it.disabled } as Function<TargetServerGroup, Boolean>).orElse(true)

// If the server group shows as disabled, we don't need to do anything special w.r.t. interestingHealthProviderNames.
if (isDisabled) {
if (!serverGroup.isPresent()) {
return false
} else {
def targetServerGroup = serverGroup.get()
if (stage.context.desiredPercentage) {
// TODO(lwander) investigate if the non-desiredPercentage case code can be dropped below in favor of this
return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
}

// The operation can be considered complete if it was requested to only consider the platform health.
def platformHealthType = targetServerGroup.instances.collect { instance ->
HealthHelper.findPlatformHealth(instance.health)
}?.find {
it.type
}?.type

if (!platformHealthType) {
platformHealthType = healthProviderNamesByPlatform[getCloudProvider(stage)]
}

return !(platformHealthType && interestingHealthProviderNames == [platformHealthType])
}

// Even if the server group is disabled, we want to make sure instances are down
// to prevent downstream stages (e.g. scaleDownCluster) from having to deal with disabled-but-instances-up server groups
def targetServerGroup = serverGroup.get()
if (targetServerGroup.isDisabled() || stage.context.desiredPercentage) {
return !waitForRequiredInstancesDownTask.hasSucceeded(stage, targetServerGroup as Map, targetServerGroup.getInstances(), interestingHealthProviderNames)
}

// TODO(lwander) investigate if the non-desiredPercentage/only-platform-health case code can be dropped in favor of waitForRequiredInstancesDownTask
// The operation can be considered complete if it was requested to only consider the platform health.
def platformHealthType = getPlatformHealthType(stage, targetServerGroup)
return !(platformHealthType && interestingHealthProviderNames == [platformHealthType])
}

private String getPlatformHealthType(Stage stage, TargetServerGroup targetServerGroup) {
def platformHealthType = targetServerGroup.instances.collect { instance ->
HealthHelper.findPlatformHealth(instance.health)
}?.find {
it.type
}?.type

return platformHealthType ? platformHealthType : healthProviderNamesByPlatform[getCloudProvider(stage)]
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package com.netflix.spinnaker.orca.clouddriver.tasks.cluster

import com.netflix.spinnaker.orca.TaskResult
import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.ServerGroupCreator
import com.netflix.spinnaker.orca.clouddriver.tasks.servergroup.WaitForRequiredInstancesDownTask
import com.netflix.spinnaker.orca.clouddriver.utils.OortHelper
import com.netflix.spinnaker.orca.pipeline.model.Execution
import com.netflix.spinnaker.orca.pipeline.model.Stage
import spock.lang.Shared
import spock.lang.Specification
import spock.lang.Subject
import spock.lang.Unroll

import static com.netflix.spinnaker.orca.ExecutionStatus.RUNNING
import static com.netflix.spinnaker.orca.ExecutionStatus.SUCCEEDED
import static com.netflix.spinnaker.orca.test.model.ExecutionBuilder.stage

class WaitForClusterDisableTaskSpec extends Specification {
def oortHelper = Mock(OortHelper)

@Shared def region = "region"
@Shared def clusterName = "clusterName"

@Shared
ServerGroupCreator serverGroupCreator = Stub(ServerGroupCreator) {
getCloudProvider() >> "cloudProvider"
isKatoResultExpected() >> false
getOperations(_) >> [["aOp": "foo"]]
}

@Subject def task = new WaitForClusterDisableTask([serverGroupCreator])

@Unroll
def "status=#status when oldSGDisabled=#oldSGDisabled, desiredPercentage=#desiredPct, interestingHealthProviderNames=#interestingHealthProviderNames"() {
given:
def stage = stage {
context = [
cluster : clusterName,
credentials : "test",
"deploy.server.groups" : [
(dsgregion): ["$clusterName-$oldServerGroup".toString()]
],
(desiredPct ? "desiredPercentage" : "blerp"): desiredPct,
interestingHealthProviderNames : interestingHealthProviderNames
]
}
stage.setStartTime(System.currentTimeMillis())

oortHelper.getCluster(*_) >> [
name: clusterName,
serverGroups: [
serverGroup("$clusterName-v050".toString(), "us-west-1", [:]),
serverGroup("$clusterName-v051".toString(), "us-west-1", [:]),
serverGroup("$clusterName-$newServerGroup".toString(), region, [:]),
serverGroup("$clusterName-$oldServerGroup".toString(), region, [
disabled: oldSGDisabled,
capacity: [desired: desired],
instances: [
instance('i-1', platformHealthState, extraHealths),
instance('i-2', platformHealthState, extraHealths),
instance('i-3', platformHealthState, extraHealths),
]
])
]
]

task.oortHelper = oortHelper
task.waitForRequiredInstancesDownTask = new WaitForRequiredInstancesDownTask()
task.MINIMUM_WAIT_TIME_MS = minWaitTime

when:
TaskResult result = task.execute(stage)

then:
result.getStatus() == status

where:
dsgregion | minWaitTime | oldSGDisabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status
"other" | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // exercises if (!remainingDeployServerGroups)...
"other" | 90 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || RUNNING // keeps running if duration < minWaitTime

// tests for isDisabled==true
region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled
region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct
region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED
region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down...
region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up

// tests for isDisabled==false, no desiredPct
region | 0 | false | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early
region | 0 | false | 3 | null | null | [] | 'Unknown' || RUNNING // exercises null vs empty behavior of interestingHealthProviderNames
region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health
region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Up' || SUCCEEDED // considered complete because only considers the platform health, despite platform health being Up
region | 0 | false | 3 | null | ['strangeType'] | [] | 'Unknown' || RUNNING // can't complete if we need to monitor an unknown health provider...
region | 0 | false | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || RUNNING // ...regardless of down status

// tests for waitForRequiredInstancesDownTask.hasSucceeded
region | 0 | false | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform, and it looks down
region | 0 | false | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform, and it looks NOT down
region | 0 | false | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize
region | 0 | false | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done
region | 0 | false | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done
region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // ...unless we have data for that health provider
region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...unless we have data for that health provider

oldServerGroup = "v167"
newServerGroup = "v168"
}

@Unroll
def "fails with '#message' when clusterData=#clusterData"() {
given:
def stage = new Stage(Execution.newPipeline("orca"), "test", [
"deploy.server.groups": [
(region): ["$clusterName-v42".toString()]
]
])

oortHelper.getCluster(*_) >> clusterData
task.oortHelper = oortHelper

when:
task.execute(stage)

then:
IllegalStateException e = thrown()
e.message.startsWith(expectedMessage)

where:
clusterData || expectedMessage
Optional.empty() || 'no cluster details found'
[name: clusterName, serverGroups: []] || 'no server groups found'
}

private static Map instance(name, platformHealthState = 'Unknown', extraHealths = []) {
return [
name: name,
launchTime: null,
health: [[healthClass: 'platform', type: 'platformHealthType', state: platformHealthState]] + extraHealths,
healthState: null,
zone: 'thezone'
]
}

private static Map serverGroup(name, region, Map other) {
return [
name : name,
region: region,
] + other
}

private static Map health(String name, String state) {
return [
healthClass: name + 'Class',
type: name + 'Type',
state: state
]
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,13 @@ public TaskResult(ExecutionStatus status, Map<String, ?> context) {
public @Nonnull Map<String, ?> getOutputs() {
return outputs;
}

@Override
public String toString() {
return "TaskResult{" +
"status=" + status +
", context=" + context +
", outputs=" + outputs +
'}';
}
}

0 comments on commit 462bfaa

Please sign in to comment.