Skip to content

Commit

Permalink
[k8s public preview] Pull in 1.0.9.4 fixes, update images and Helm ch…
Browse files Browse the repository at this point in the history
…arts (Azure#3272)

* Fix edge agent connection 1.0.9 (Azure#3172)

* Recreate connection after every exception

* Fix: Edgelet unable to pull using certain passwords - cherry-pick into 1.0.9. (Azure#3206) (Azure#3209)

Some passwords (especially generated passwords) can be used by the docker CLI, but not by Edgelet. There is a chance of this happening when passwords contain `?` or `~`.

It's not explicitly stated in the [Docker API](https://docs.docker.com/engine/api/v1.40/#section/Authentication), but the [code](https://github.com/docker/cli/blob/master/cli/command/registry.go#L50) is written such that the `X-Registry-Auth` header is expected to be URL safe (RFC 4648) base64.

* Agent reported state as "406" when modules are in backoff. (Azure#3244)

To reproduce

1.    create a deployment which has a module which fails to launch, like an incorrect image name, or some weird mount Docker doesn't like.
2.    Look at the reported status of the edge runtime - If you are lucky, you will see the real error.
3.    Let the runtime run for long enough that the "OrderedRetryPlanRunner" puts the module on backoff.
4.    Look at the reported status of the edge runtime - You won't see the real error message from the failure - you will see a "406 -- The device is offline or not sending status reports"

The root cause is that every reconcile starts with status=Unknown(406), and when a module is in backoff, the status doesn't update.

So it goes:
reconcile loop 1: module command fails, error is reported
reconcile loop 2: module is in backoff now, Unknown is reported
...
reconcile loop n: module command is attempted again, error is reported
reconcile loop n+1: module is in backoff again, Unknown is reported.

User is more likely to see "406 -- The device is offline or not sending status reports" which is not cool.
This has been seen before, one example: Azure#2066

The fix here is to keep track of status in Agent and report the current status until it changes (either via a new error or a successful reconcile).

* Update dockerfiles for arm32 & amd64, update Helm charts.

* Fix the edge agent tests.

Co-authored-by: Anca Antochi <[email protected]>
  • Loading branch information
darobs and ancaantochi authored Jul 23, 2020
1 parent bfac673 commit fd36404
Show file tree
Hide file tree
Showing 12 changed files with 162 additions and 29 deletions.
2 changes: 1 addition & 1 deletion edge-agent/docker/linux/amd64/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG base_tag=2.1.13-alpine3.10
ARG base_tag=2.1.17-alpine3.10
FROM azureiotedge/azureiotedge-runtime-base:1.2-linux-amd64 as builder

FROM mcr.microsoft.com/dotnet/core/runtime:${base_tag}
Expand Down
2 changes: 1 addition & 1 deletion edge-agent/docker/linux/arm32v7/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG base_tag=1.0.6-linux-arm32v7
ARG base_tag=1.0.4.1.k8s-linux-arm32v7
FROM azureiotedge/azureiotedge-agent-base:${base_tag}

ARG EXE_DIR=.
Expand Down
2 changes: 1 addition & 1 deletion edge-agent/docker/linux/arm32v7/base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG base_tag=2.1.16-bionic-arm32v7
ARG base_tag=2.1.19-bionic-arm32v7
FROM mcr.microsoft.com/dotnet/core/runtime:${base_tag}

RUN apt-get update && \
Expand Down
19 changes: 10 additions & 9 deletions edge-agent/src/Microsoft.Azure.Devices.Edge.Agent.Core/Agent.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public class Agent
readonly IAvailabilityMetric availabilityMetric;
IEnvironment environment;
DeploymentConfigInfo currentConfig;
DeploymentStatus status;

public Agent(
IConfigSource configSource,
Expand All @@ -58,6 +59,7 @@ public Agent(
this.environment = this.environmentProvider.Create(this.currentConfig.DeploymentConfig);
this.encryptionProvider = Preconditions.CheckNotNull(encryptionProvider, nameof(encryptionProvider));
this.availabilityMetric = Preconditions.CheckNotNull(availabilityMetric, nameof(availabilityMetric));
this.status = DeploymentStatus.Unknown;
Events.AgentCreated();
}

Expand Down Expand Up @@ -109,7 +111,6 @@ await deploymentConfigInfoJson.ForEachAsync(

public async Task ReconcileAsync(CancellationToken token)
{
DeploymentStatus status = DeploymentStatus.Unknown;
ModuleSet moduleSetToReport = null;
using (await this.reconcileLock.LockAsync(token))
{
Expand All @@ -126,7 +127,7 @@ public async Task ReconcileAsync(CancellationToken token)
DeploymentConfig deploymentConfig = deploymentConfigInfo.DeploymentConfig;
if (deploymentConfig.Equals(DeploymentConfig.Empty))
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
else
{
Expand All @@ -142,7 +143,7 @@ public async Task ReconcileAsync(CancellationToken token)

if (plan.IsEmpty)
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
else
{
Expand All @@ -152,7 +153,7 @@ public async Task ReconcileAsync(CancellationToken token)
await this.UpdateCurrentConfig(deploymentConfigInfo);
if (result)
{
status = DeploymentStatus.Success;
this.status = DeploymentStatus.Success;
}
}
catch (Exception ex) when (!ex.IsFatal())
Expand All @@ -169,28 +170,28 @@ public async Task ReconcileAsync(CancellationToken token)
switch (ex)
{
case ConfigEmptyException _:
status = new DeploymentStatus(DeploymentStatusCode.ConfigEmptyError, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.ConfigEmptyError, ex.Message);
Events.EmptyConfig(ex);
break;

case InvalidSchemaVersionException _:
status = new DeploymentStatus(DeploymentStatusCode.InvalidSchemaVersion, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.InvalidSchemaVersion, ex.Message);
Events.InvalidSchemaVersion(ex);
break;

case ConfigFormatException _:
status = new DeploymentStatus(DeploymentStatusCode.ConfigFormatError, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.ConfigFormatError, ex.Message);
Events.InvalidConfigFormat(ex);
break;

default:
status = new DeploymentStatus(DeploymentStatusCode.Failed, ex.Message);
this.status = new DeploymentStatus(DeploymentStatusCode.Failed, ex.Message);
Events.UnknownFailure(ex);
break;
}
}

await this.reporter.ReportAsync(token, moduleSetToReport, await this.environment.GetRuntimeInfoAsync(), this.currentConfig.Version, status);
await this.reporter.ReportAsync(token, moduleSetToReport, await this.environment.GetRuntimeInfoAsync(), this.currentConfig.Version, this.status);
Events.FinishedReconcile();
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ async Task<Twin> GetTwinFunc()
{
Events.ErrorGettingTwin(e);

if (!retrying && moduleClient != null && !(e is TimeoutException))
if (!retrying && moduleClient != null)
{
try
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All rights reserved.
namespace Microsoft.Azure.Devices.Edge.Agent.IoTHub.SdkClient
{
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.Devices.Client;
Expand All @@ -14,7 +15,18 @@ public class WrappingSdkModuleClient : ISdkModuleClient
/// <summary>
/// Creates a wrapper around the given SDK module client.
/// </summary>
/// <param name="sdkModuleClient">The SDK client to wrap; passed through <c>Preconditions.CheckNotNull</c>.</param>
public WrappingSdkModuleClient(ModuleClient sdkModuleClient)
{
    this.sdkModuleClient = Preconditions.CheckNotNull(sdkModuleClient, nameof(sdkModuleClient));
}

public Task OpenAsync() => this.sdkModuleClient.OpenAsync();
/// <summary>
/// Opens the wrapped SDK module client.
/// </summary>
/// <remarks>
/// The open call is awaited inside the try block so that failures delivered through the
/// returned task (and not only synchronous throws) reach the catch block.
/// </remarks>
/// <returns>A task that completes once the wrapped client's open call has completed.</returns>
public async Task OpenAsync()
{
    try
    {
        await this.sdkModuleClient.OpenAsync();
    }
    catch (Exception)
    {
        // Dispose the wrapped client on any open failure, then rethrow with the
        // original stack trace so the caller still observes the error.
        this.sdkModuleClient?.Dispose();
        throw;
    }
}

/// <summary>
/// Forwards the connection status change handler to the wrapped SDK module client.
/// </summary>
/// <param name="statusChangesHandler">Handler passed unchanged to the wrapped client.</param>
public void SetConnectionStatusChangesHandler(ConnectionStatusChangesHandler statusChangesHandler)
{
    this.sdkModuleClient.SetConnectionStatusChangesHandler(statusChangesHandler);
}
Expand Down Expand Up @@ -47,6 +59,10 @@ public Task UpdateReportedPropertiesAsync(TwinCollection reportedProperties)
//// return await EdgeClientWebSocket.Connect(deviceStreamRequest.Url, deviceStreamRequest.AuthorizationToken, cancellationToken);
////}

public Task CloseAsync() => this.sdkModuleClient.CloseAsync();
/// <summary>
/// Closes this wrapper by disposing the wrapped SDK module client.
/// </summary>
/// <returns>An already-completed task; the dispose call itself is synchronous.</returns>
public Task CloseAsync()
{
    ModuleClient wrappedClient = this.sdkModuleClient;
    wrappedClient.Dispose();

    return Task.CompletedTask;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,121 @@ public async Task CurrentIsNotNullBecauseDesiredThrew()
mockPlanRunner.Verify(r => r.ExecuteAsync(1, It.IsAny<Plan>(), token), Times.Never);
}

/// <summary>
/// Runs a single reconcile where the plan runner returns <c>false</c> and verifies that
/// the reporter is called with <c>DeploymentStatus.Unknown</c>.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test framework can await the
/// test and observe any exception, matching the other async tests in this file.
/// </remarks>
[Fact]
public async Task ReconcileAsyncExecuteAsyncIncompleteDefaulsUnknown()
{
    var desiredModule = new TestModule("desired", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, new ConfigurationInfo("1"), null);
    var currentModule = new TestModule("current", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, new ConfigurationInfo("1"), null);
    var commandList = new List<ICommand>
    {
        new Mock<ICommand>().Object,
        new Mock<ICommand>().Object,
    };
    var testPlan = new Plan(commandList);
    var token = default(CancellationToken);
    var runtimeInfo = Mock.Of<IRuntimeInfo>();
    var deploymentConfig = new DeploymentConfig("1.0", runtimeInfo, new SystemModules(null, null), new Dictionary<string, IModule> { ["desired"] = desiredModule });
    var deploymentConfigInfo = new DeploymentConfigInfo(0, deploymentConfig);
    ModuleSet desiredSet = deploymentConfig.GetModuleSet();
    ModuleSet currentSet = ModuleSet.Create(currentModule);

    var mockConfigSource = new Mock<IConfigSource>();
    var mockEnvironment = new Mock<IEnvironment>();
    var mockPlanner = new Mock<IPlanner>();
    var mockPlanRunner = new Mock<IPlanRunner>();
    var mockReporter = new Mock<IReporter>();
    var mockModuleIdentityLifecycleManager = new Mock<IModuleIdentityLifecycleManager>();
    var configStore = Mock.Of<IEntityStore<string, string>>();
    var mockEnvironmentProvider = Mock.Of<IEnvironmentProvider>(m => m.Create(It.IsAny<DeploymentConfig>()) == mockEnvironment.Object);
    var serde = Mock.Of<ISerde<DeploymentConfigInfo>>();
    var encryptionDecryptionProvider = Mock.Of<IEncryptionProvider>();
    var availabilityMetric = Mock.Of<IAvailabilityMetric>();

    mockConfigSource.Setup(cs => cs.GetDeploymentConfigInfoAsync())
        .ReturnsAsync(deploymentConfigInfo);
    mockEnvironment.Setup(env => env.GetModulesAsync(token))
        .ReturnsAsync(currentSet);
    mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(desiredSet, currentSet))
        .ReturnsAsync(ImmutableDictionary<string, IModuleIdentity>.Empty);
    mockPlanner.Setup(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty))
        .Returns(Task.FromResult(testPlan));
    mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(It.IsAny<ModuleSet>(), currentSet))
        .Returns(Task.FromResult((IImmutableDictionary<string, IModuleIdentity>)ImmutableDictionary<string, IModuleIdentity>.Empty));

    // The reporter setup only matches DeploymentStatus.Unknown; VerifyAll below checks it was invoked.
    mockReporter.Setup(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), DeploymentStatus.Unknown))
        .Returns(Task.CompletedTask);

    // The plan runner reports the plan as not completed.
    mockPlanRunner.SetupSequence(m => m.ExecuteAsync(It.IsAny<long>(), It.IsAny<Plan>(), It.IsAny<CancellationToken>()))
        .ReturnsAsync(false);
    var agent = new Agent(mockConfigSource.Object, mockEnvironmentProvider, mockPlanner.Object, mockPlanRunner.Object, mockReporter.Object, mockModuleIdentityLifecycleManager.Object, configStore, DeploymentConfigInfo.Empty, serde, encryptionDecryptionProvider, availabilityMetric);

    await agent.ReconcileAsync(token);

    mockEnvironment.Verify(env => env.GetModulesAsync(token), Times.Once());
    mockPlanner.Verify(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty), Times.Once());
    mockPlanRunner.VerifyAll();
    mockReporter.VerifyAll();
}

/// <summary>
/// Runs two reconciles: the first plan execution throws, the second returns <c>false</c>.
/// Verifies that both reports carry <c>DeploymentStatusCode.Failed</c>, i.e. the second
/// report repeats the status recorded by the first.
/// </summary>
/// <remarks>
/// Returns <see cref="Task"/> rather than <c>void</c> so the test framework can await the
/// test and observe any exception, matching the other async tests in this file.
/// </remarks>
[Fact]
public async Task ReconcileAsyncExecuteAsyncIncompleteReportsLastState()
{
    var desiredModule = new TestModule("desired", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, new ConfigurationInfo("1"), null);
    var currentModule = new TestModule("current", "v1", "test", ModuleStatus.Running, new TestConfig("image"), RestartPolicy.OnUnhealthy, ImagePullPolicy.OnCreate, new ConfigurationInfo("1"), null);
    var commandList = new List<ICommand>
    {
        new Mock<ICommand>().Object,
        new Mock<ICommand>().Object,
    };
    var testPlan = new Plan(commandList);
    var token = default(CancellationToken);
    var runtimeInfo = Mock.Of<IRuntimeInfo>();
    var deploymentConfig = new DeploymentConfig("1.0", runtimeInfo, new SystemModules(null, null), new Dictionary<string, IModule> { ["desired"] = desiredModule });
    var deploymentConfigInfo = new DeploymentConfigInfo(0, deploymentConfig);
    ModuleSet desiredSet = deploymentConfig.GetModuleSet();
    ModuleSet currentSet = ModuleSet.Create(currentModule);

    // Collects the status code passed to each ReportAsync call, in call order.
    var statuses = new List<DeploymentStatusCode>();

    var mockConfigSource = new Mock<IConfigSource>();
    var mockEnvironment = new Mock<IEnvironment>();
    var mockPlanner = new Mock<IPlanner>();
    var mockPlanRunner = new Mock<IPlanRunner>();
    var mockReporter = new Mock<IReporter>();
    var mockModuleIdentityLifecycleManager = new Mock<IModuleIdentityLifecycleManager>();
    var configStore = Mock.Of<IEntityStore<string, string>>();
    var mockEnvironmentProvider = Mock.Of<IEnvironmentProvider>(m => m.Create(It.IsAny<DeploymentConfig>()) == mockEnvironment.Object);
    var serde = Mock.Of<ISerde<DeploymentConfigInfo>>();
    var encryptionDecryptionProvider = Mock.Of<IEncryptionProvider>();
    var availabilityMetric = Mock.Of<IAvailabilityMetric>();

    mockConfigSource.Setup(cs => cs.GetDeploymentConfigInfoAsync())
        .ReturnsAsync(deploymentConfigInfo);
    mockEnvironment.Setup(env => env.GetModulesAsync(token))
        .ReturnsAsync(currentSet);
    mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(desiredSet, currentSet))
        .ReturnsAsync(ImmutableDictionary<string, IModuleIdentity>.Empty);
    mockPlanner.Setup(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty))
        .Returns(Task.FromResult(testPlan));
    mockModuleIdentityLifecycleManager.Setup(m => m.GetModuleIdentitiesAsync(It.IsAny<ModuleSet>(), currentSet))
        .Returns(Task.FromResult((IImmutableDictionary<string, IModuleIdentity>)ImmutableDictionary<string, IModuleIdentity>.Empty));
    mockReporter.Setup(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), It.IsAny<DeploymentStatus>()))
        .Callback((CancellationToken _t, ModuleSet _m, IRuntimeInfo _r, long _v, DeploymentStatus d) => statuses.Add(d.Code))
        .Returns(Task.CompletedTask);
    // First call represents a command failure, 2nd call represents an execution backoff.
    mockPlanRunner.SetupSequence(m => m.ExecuteAsync(It.IsAny<long>(), It.IsAny<Plan>(), It.IsAny<CancellationToken>()))
        .ThrowsAsync(new Exception("generic exception"))
        .ReturnsAsync(false);
    var agent = new Agent(mockConfigSource.Object, mockEnvironmentProvider, mockPlanner.Object, mockPlanRunner.Object, mockReporter.Object, mockModuleIdentityLifecycleManager.Object, configStore, DeploymentConfigInfo.Empty, serde, encryptionDecryptionProvider, availabilityMetric);

    await agent.ReconcileAsync(token);
    await agent.ReconcileAsync(token);

    mockEnvironment.Verify(env => env.GetModulesAsync(token), Times.Exactly(2));
    mockPlanner.Verify(pl => pl.PlanAsync(It.IsAny<ModuleSet>(), currentSet, runtimeInfo, ImmutableDictionary<string, IModuleIdentity>.Empty), Times.Exactly(2));
    mockPlanRunner.VerifyAll();
    mockReporter.Verify(r => r.ReportAsync(token, It.IsAny<ModuleSet>(), It.IsAny<IRuntimeInfo>(), It.IsAny<long>(), It.IsAny<DeploymentStatus>()), Times.Exactly(2));

    // Both the failing reconcile and the subsequent incomplete one must report Failed.
    Assert.Equal(DeploymentStatusCode.Failed, statuses[0]);
    Assert.Equal(DeploymentStatusCode.Failed, statuses[1]);
}

[Fact]
public async Task ReportShutdownAsyncConfigTest()
{
Expand Down
2 changes: 1 addition & 1 deletion edgelet/edgelet-docker/src/runtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ impl ModuleRegistry for DockerModuleRuntime {
let json = serde_json::to_string(a).with_context(|_| {
ErrorKind::RegistryOperation(RegistryOperation::PullImage(image.clone()))
})?;
Ok(base64::encode(&json))
Ok(base64::encode_config(&json, base64::URL_SAFE))
},
);

Expand Down
15 changes: 8 additions & 7 deletions edgelet/edgelet-docker/tests/runtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -407,13 +407,13 @@ fn image_pull_with_invalid_creds_handler(req: Request<Body>) -> ResponseFuture {
.headers()
.get_all("X-Registry-Auth")
.into_iter()
.map(|bytes| base64::decode(bytes).unwrap())
.map(|bytes| base64::decode_config(bytes, base64::URL_SAFE).unwrap())
.map(|raw| str::from_utf8(&raw).unwrap().to_owned())
.collect::<Vec<String>>()
.join("");
let auth_config: AuthConfig = serde_json::from_str(&auth_str.to_string()).unwrap();
assert_eq!(auth_config.username(), Some("u1"));
assert_eq!(auth_config.password(), Some("wrong_password"));
let auth_config: AuthConfig = serde_json::from_str(&auth_str).unwrap();
assert_eq!(auth_config.username(), Some("us1"));
assert_eq!(auth_config.password(), Some("ac?ac~aaac???"));
assert_eq!(auth_config.email(), Some("[email protected]"));
assert_eq!(auth_config.serveraddress(), Some("svr1"));

Expand Down Expand Up @@ -460,9 +460,10 @@ fn image_pull_with_invalid_creds_fails() {

let task = DockerModuleRuntime::make_runtime(settings, provisioning_result(), crypto())
.and_then(|runtime| {
// password is written to guarantee base64 encoding has '-' and/or '_'
let auth = AuthConfig::new()
.with_username("u1".to_string())
.with_password("wrong_password".to_string())
.with_username("us1".to_string())
.with_password("ac?ac~aaac???".to_string())
.with_email("[email protected]".to_string())
.with_serveraddress("svr1".to_string());
let config = DockerConfig::new(
Expand Down Expand Up @@ -595,7 +596,7 @@ fn image_pull_with_creds_handler(req: Request<Body>) -> ResponseFuture {
.headers()
.get_all("X-Registry-Auth")
.into_iter()
.map(|bytes| base64::decode(bytes).unwrap())
.map(|bytes| base64::decode_config(bytes, base64::URL_SAFE).unwrap())
.map(|raw| str::from_utf8(&raw).unwrap().to_owned())
.collect::<Vec<String>>()
.join("");
Expand Down
2 changes: 1 addition & 1 deletion kubernetes/charts/edge-kubernetes-crd/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: '1.0'
description: A Helm chart for installing CRD for Azure IoT Edge on Kubernetes
name: edge-kubernetes-crd
version: 0.2.7
version: 0.2.8
2 changes: 1 addition & 1 deletion kubernetes/charts/edge-kubernetes/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ apiVersion: v1
appVersion: '1.0'
description: A Helm chart for running Azure IoT Edge on Kubernetes
name: edge-kubernetes
version: 0.2.7
version: 0.2.8
8 changes: 4 additions & 4 deletions kubernetes/charts/edge-kubernetes/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
iotedged:
image:
repository: azureiotedge/azureiotedge-iotedged
tag: 0.1.0-beta9
tag: 0.1.0-beta10
pullPolicy: Always
nodeSelector: {}
# Volumes which support ownership management are modified to be owned and
Expand Down Expand Up @@ -101,7 +101,7 @@ iotedged:
iotedgedProxy:
image:
repository: azureiotedge/azureiotedge-proxy
tag: 0.1.0-beta9
tag: 0.1.0-beta10
pullPolicy: Always

# Edge Agent image configuration
Expand All @@ -115,7 +115,7 @@ edgeAgent:
containerName: edgeagent
image:
repository: azureiotedge/azureiotedge-agent
tag: 0.1.0-beta9
tag: 0.1.0-beta10
pullPolicy: Always
hostname: "localhost"
env:
Expand Down Expand Up @@ -291,4 +291,4 @@ provisioning:
# provisioning:
# source: "external"
# endpoint: "http://localhost:9999"
# dynamicReprovisioning: false
# dynamicReprovisioning: false

0 comments on commit fd36404

Please sign in to comment.