Skip to content

Commit

Permalink
fix(igor): Handle timeouts kicking off builds (spinnaker#3915)
Browse files Browse the repository at this point in the history
* fix(igor): Handle timeouts kicking off builds

This a companion PR to spinnaker/igor#863

The issue here is that when kicking off a build takes a long time (e.g. when jenkins has a lot of job configs) orca might kick off a lot of builds instead of just one (and might still fail!)
Igor now adds support for "pending" operation so if the same request is received twice it will return 202. This change adds the corresponding support to orca, on 202, we just need to retry, because eventually igor will give us the build number

* fixup! fix(igor): Handle timeouts kicking off builds

* fixup! fixup! fix(igor): Handle timeouts kicking off builds

* fixup! fixup! fixup! fix(igor): Handle timeouts kicking off builds

* fixup! fixup! fixup! fixup! fix(igor): Handle timeouts kicking off builds

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
marchello2000 and mergify[bot] authored Sep 22, 2020
1 parent 3b2ed54 commit ae902c7
Show file tree
Hide file tree
Showing 6 changed files with 148 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;
import org.springframework.web.util.UriUtils;
import retrofit.client.Response;

@Component
@RequiredArgsConstructor
Expand All @@ -31,7 +32,7 @@ private String encode(String uri) {
return UriUtils.encodeFragment(uri, "UTF-8");
}

public String build(String master, String jobName, Map<String, String> queryParams) {
public Response build(String master, String jobName, Map<String, String> queryParams) {
return igorService.build(master, encode(jobName), queryParams, "");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,12 @@
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import retrofit.client.Response;
import retrofit.http.*;

public interface IgorService {
@PUT("/masters/{name}/jobs/{jobName}")
String build(
Response build(
@Path("name") String master,
@Path(encode = false, value = "jobName") String jobName,
@QueryMap Map<String, String> queryParams,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,82 @@
package com.netflix.spinnaker.orca.igor.tasks

import com.fasterxml.jackson.databind.ObjectMapper
import com.netflix.spinnaker.kork.exceptions.SystemException
import com.netflix.spinnaker.orca.api.pipeline.RetryableTask
import com.netflix.spinnaker.orca.api.pipeline.models.ExecutionStatus
import com.netflix.spinnaker.orca.api.pipeline.Task
import com.netflix.spinnaker.orca.api.pipeline.models.StageExecution
import com.netflix.spinnaker.orca.api.pipeline.TaskResult
import com.netflix.spinnaker.orca.exceptions.ExceptionHandler
import com.netflix.spinnaker.orca.igor.BuildService
import com.netflix.spinnaker.orca.retrofit.exceptions.RetrofitExceptionHandler
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.http.HttpStatus
import org.springframework.stereotype.Component
import retrofit.RetrofitError
import retrofit.client.Response

import javax.annotation.Nonnull
import java.time.Duration

@Component
class StartJenkinsJobTask implements Task {
class StartJenkinsJobTask implements RetryableTask {
private final Logger log = LoggerFactory.getLogger(getClass())

@Autowired
BuildService buildService

@Autowired
ObjectMapper objectMapper

@Autowired
RetrofitExceptionHandler retrofitExceptionHandler

@Nonnull
@Override
TaskResult execute(@Nonnull StageExecution stage) {
String master = stage.context.master
String job = stage.context.job
String queuedBuild = buildService.build(master, job, stage.context.parameters)
TaskResult.builder(ExecutionStatus.SUCCEEDED).context([queuedBuild: queuedBuild]).build()

try {
Response igorResponse = buildService.build(master, job, stage.context.parameters)

if (igorResponse.getStatus() == HttpStatus.ACCEPTED.value()) {
log.info("build for job=$job on master=$master is pending, waiting for build to start")
return TaskResult.RUNNING
}

if (igorResponse.getStatus() == HttpStatus.OK.value()) {
String queuedBuild = igorResponse.body.in().text
return TaskResult
.builder(ExecutionStatus.SUCCEEDED)
.context([queuedBuild: queuedBuild])
.build()
}
}
catch (RetrofitError e) {
// This igor call is idempotent so we can retry despite it being PUT/POST
ExceptionHandler.Response exceptionResponse = retrofitExceptionHandler.handle("StartJenkinsJob", e)

if (exceptionResponse.shouldRetry) {
log.warn("Failure communicating with igor to start a jenkins job, will retry", e)
return TaskResult.RUNNING
}
throw e
}

throw new SystemException("Failure starting jenkins job")
}

@Override
long getBackoffPeriod() {
return Duration.ofSeconds(30).toMillis()
}

@Override
long getTimeout() {
return Duration.ofMinutes(15).toMillis()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,41 @@
package com.netflix.spinnaker.orca.igor.tasks

import com.fasterxml.jackson.databind.ObjectMapper
import com.netflix.spinnaker.kork.exceptions.IntegrationException
import com.netflix.spinnaker.kork.exceptions.SystemException
import com.netflix.spinnaker.orca.api.pipeline.RetryableTask
import com.netflix.spinnaker.orca.api.pipeline.models.ExecutionStatus
import com.netflix.spinnaker.orca.api.pipeline.Task
import com.netflix.spinnaker.orca.api.pipeline.models.StageExecution
import com.netflix.spinnaker.orca.api.pipeline.TaskResult
import com.netflix.spinnaker.orca.exceptions.ExceptionHandler
import com.netflix.spinnaker.orca.igor.BuildService
import com.netflix.spinnaker.orca.retrofit.exceptions.RetrofitExceptionHandler
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.beans.factory.annotation.Value
import org.springframework.http.HttpStatus
import org.springframework.stereotype.Component
import retrofit.RetrofitError
import retrofit.client.Response

import javax.annotation.Nonnull
import java.time.Duration

@Component
class StartScriptTask implements Task {
class StartScriptTask implements RetryableTask {
private final Logger log = LoggerFactory.getLogger(getClass())

@Autowired
BuildService buildService

@Autowired
ObjectMapper objectMapper

@Autowired
RetrofitExceptionHandler retrofitExceptionHandler

@Value('${script.master:master}')
String master

Expand Down Expand Up @@ -66,13 +81,13 @@ class StartScriptTask implements Task {
}

def parameters = [
SCRIPT_PATH : scriptPath,
COMMAND : command,
IMAGE_ID : image,
REGION_PARAM : region,
ENV_PARAM : account,
CLUSTER_PARAM: cluster,
CMC : cmc
SCRIPT_PATH : scriptPath,
COMMAND : command,
IMAGE_ID : image,
REGION_PARAM : region,
ENV_PARAM : account,
CLUSTER_PARAM: cluster,
CMC : cmc
]

if (repoUrl) {
Expand All @@ -83,8 +98,43 @@ class StartScriptTask implements Task {
parameters.REPO_BRANCH = repoBranch
}

String queuedBuild = buildService.build(master, job, parameters)
TaskResult.builder(ExecutionStatus.SUCCEEDED).context([master: master, job: job, queuedBuild: queuedBuild, REPO_URL: repoUrl?:'default' ]).build()
try {
Response igorResponse = buildService.build(master, job, parameters)

if (igorResponse.getStatus() == HttpStatus.ACCEPTED.value()) {
log.info("script for job=$job on master=$master is pending, waiting for script to start")
return TaskResult.RUNNING
}

if (igorResponse.getStatus() == HttpStatus.OK.value()) {
String queuedBuild = igorResponse.body.in().text
return TaskResult
.builder(ExecutionStatus.SUCCEEDED)
.context([master: master, job: job, queuedBuild: queuedBuild, REPO_URL: repoUrl ?: 'default'])
.build()
}
}
catch (RetrofitError e) {
// This igor call is idempotent so we can retry despite it being PUT/POST
ExceptionHandler.Response exceptionResponse = retrofitExceptionHandler.handle("StartJenkinsJob", e)

if (exceptionResponse.shouldRetry) {
log.warn("Failure communicating with igor to start a jenkins job, will retry", e)
return TaskResult.RUNNING
}
throw e
}

throw new SystemException("Failure starting script")
}

@Override
long getBackoffPeriod() {
return Duration.ofSeconds(30).toMillis()
}

@Override
long getTimeout() {
return Duration.ofMinutes(15).toMillis()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ import com.netflix.spinnaker.orca.api.pipeline.models.ExecutionStatus
import com.netflix.spinnaker.orca.igor.BuildService
import com.netflix.spinnaker.orca.pipeline.model.PipelineExecutionImpl
import com.netflix.spinnaker.orca.pipeline.model.StageExecutionImpl
import com.netflix.spinnaker.orca.retrofit.exceptions.RetrofitExceptionHandler
import retrofit.RetrofitError
import retrofit.client.Response
import retrofit.mime.TypedString
import spock.lang.Shared
import spock.lang.Specification
import spock.lang.Subject
Expand All @@ -35,6 +38,8 @@ class StartJenkinsJobTaskSpec extends Specification {
task.objectMapper = Mock(ObjectMapper) {
convertValue(_,_) >> [:]
}

task.retrofitExceptionHandler = new RetrofitExceptionHandler()
}

@Shared
Expand All @@ -46,7 +51,8 @@ class StartJenkinsJobTaskSpec extends Specification {

and:
task.buildService = Stub(BuildService) {
build(stage.context.master, stage.context.job, stage.context.parameters) >> [result: 'SUCCESS', running: true, number: 4]
build(stage.context.master, stage.context.job, stage.context.parameters) >>
new Response("", 200, "OK", [], new TypedString(new ObjectMapper().writeValueAsString([result: 'SUCCESS', running: true, number: 4])))
}

when:
Expand All @@ -62,7 +68,8 @@ class StartJenkinsJobTaskSpec extends Specification {

and:
task.buildService = Stub(BuildService) {
build(stage.context.master, stage.context.job, stage.context.parameters) >> [ result : 'SUCCESS', running: true, number: 4 ]
build(stage.context.master, stage.context.job, stage.context.parameters) >>
new Response("", 200, "OK", [], new TypedString(new ObjectMapper().writeValueAsString([result: 'SUCCESS', running: true, number: 4])))
}

when:
Expand All @@ -87,4 +94,21 @@ class StartJenkinsJobTaskSpec extends Specification {
then:
thrown(RetrofitError)
}

def "handle 202 response from igor"() {
given:
def stage = new StageExecutionImpl(pipeline, "jenkins", [master: "builds", job: "orca"])

and:
task.buildService = Stub(BuildService) {
build(stage.context.master, stage.context.job, stage.context.parameters) >>
new Response("", 202, "OK", [], null)
}

when:
def result = task.execute(stage)

then:
result.status == ExecutionStatus.RUNNING
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,21 @@ class RetrofitExceptionHandler implements ExceptionHandler {
}

// retry on 503 even for non-idempotent requests
if (e.kind == HTTP && e.response.status == HTTP_UNAVAILABLE) {
if (e.kind == HTTP && e.response?.status == HTTP_UNAVAILABLE) {
return true
}

return isIdempotentRequest(e) && (isNetworkError(e) || isGatewayErrorCode(e) || isThrottle(e))
}

boolean isGatewayErrorCode(RetrofitError e) {
e.kind == HTTP && e.response.status in [HTTP_BAD_GATEWAY, HTTP_UNAVAILABLE, HTTP_GATEWAY_TIMEOUT]
e.kind == HTTP && e.response?.status in [HTTP_BAD_GATEWAY, HTTP_UNAVAILABLE, HTTP_GATEWAY_TIMEOUT]
}

private static final int HTTP_TOO_MANY_REQUESTS = 429

boolean isThrottle(RetrofitError e) {
e.kind == HTTP && e.response.status == HTTP_TOO_MANY_REQUESTS
e.kind == HTTP && e.response?.status == HTTP_TOO_MANY_REQUESTS
}

private boolean isNetworkError(RetrofitError e) {
Expand All @@ -100,7 +100,7 @@ class RetrofitExceptionHandler implements ExceptionHandler {

private boolean isMalformedRequest(RetrofitError e) {
// We never want to retry errors like "Path parameter "blah" value must not be null.
return e.kind == UNEXPECTED && e.message.contains("Path parameter")
return e.kind == UNEXPECTED && e.message?.contains("Path parameter")
}

private static boolean isIdempotentRequest(RetrofitError e) {
Expand Down

0 comments on commit ae902c7

Please sign in to comment.