Skip to content

Commit

Permalink
Merged PR 407664: Add better health care restart planner (BHCRP)
Browse files Browse the repository at this point in the history
Adds the better health care restart planner.
  • Loading branch information
avranju committed Oct 6, 2017
1 parent bf638ee commit aeb5521
Show file tree
Hide file tree
Showing 88 changed files with 5,246 additions and 2,441 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
root = true

[*.cs]
insert_final_newline = true
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
7 changes: 6 additions & 1 deletion Microsoft.Azure.Devices.Edge.sln
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26906.1
VisualStudioVersion = 15.0.26730.16
MinimumVisualStudioVersion = 10.0.40219.1
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "agent", "agent", "{EC8E6AC9-7AD8-4749-88F3-5D0CDD52D7B0}"
ProjectSection(SolutionItems) = preProject
Expand Down Expand Up @@ -138,6 +138,11 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Azure.Devices.Edg
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Azure.Devices.Edge.Storage.Test", "edge-util\test\Microsoft.Azure.Devices.Edge.Storage.Test\Microsoft.Azure.Devices.Edge.Storage.Test.csproj", "{3EB5B58A-5820-44E7-9558-917C105B940D}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8D653BF9-C9B0-4892-97F2-4550E3FA3E8A}"
ProjectSection(SolutionItems) = preProject
.editorconfig = .editorconfig
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down
122 changes: 64 additions & 58 deletions edge-agent/src/Microsoft.Azure.Devices.Edge.Agent.Core/Agent.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.Azure.Devices.Edge.Agent.Core
{
Expand All @@ -9,71 +9,77 @@ namespace Microsoft.Azure.Devices.Edge.Agent.Core
using Microsoft.Extensions.Logging;

public class Agent
{
readonly IEnvironment environment;
readonly IPlanner planner;
readonly IConfigSource configSource;
{
readonly IEnvironment environment;
readonly IPlanner planner;
readonly IReporter reporter;
readonly IConfigSource configSource;

public Agent(IConfigSource configSource, IEnvironment environment, IPlanner planner)
{
this.configSource = Preconditions.CheckNotNull(configSource, nameof(configSource));
this.environment = Preconditions.CheckNotNull(environment, nameof(environment));
this.planner = Preconditions.CheckNotNull(planner, nameof(planner));
Events.AgentCreated();
}
/// <summary>
/// Creates an agent from its four collaborators and emits the "Agent Created." debug log event.
/// </summary>
/// <param name="configSource">Source of the desired module set; <c>ReconcileAsync</c> reads it via <c>GetModuleSetAsync</c>.</param>
/// <param name="environment">Source of the current module set; <c>ReconcileAsync</c> reads it via <c>GetModulesAsync</c>.</param>
/// <param name="planner">Builds the plan from the desired and current module sets via <c>PlanAsync</c>.</param>
/// <param name="reporter">Receives the resulting module set via <c>ReportAsync</c> at the end of <c>ReconcileAsync</c>.</param>
/// <remarks>
/// Every argument is passed through <c>Preconditions.CheckNotNull</c> before being stored.
/// NOTE(review): presumably that helper throws on a null argument - confirm against Preconditions.
/// </remarks>
public Agent(IConfigSource configSource, IEnvironment environment, IPlanner planner, IReporter reporter)
{
// Arguments are checked in declaration order: configSource, environment, planner, reporter.
this.configSource = Preconditions.CheckNotNull(configSource, nameof(configSource));
this.environment = Preconditions.CheckNotNull(environment, nameof(environment));
this.planner = Preconditions.CheckNotNull(planner, nameof(planner));
this.reporter = Preconditions.CheckNotNull(reporter, nameof(reporter));
// Logged only after all four checks have returned.
Events.AgentCreated();
}

public async Task ReconcileAsync(CancellationToken token)
{
Task<ModuleSet> envTask = this.environment.GetModulesAsync(token);
Task<ModuleSet> configTask = this.configSource.GetModuleSetAsync();
public async Task ReconcileAsync(CancellationToken token)
{
Task<ModuleSet> envTask = this.environment.GetModulesAsync(token);
Task<ModuleSet> configTask = this.configSource.GetModuleSetAsync();

await Task.WhenAll(envTask, configTask);
await Task.WhenAll(envTask, configTask);

ModuleSet current = envTask.Result;
ModuleSet desired = configTask.Result;
Plan plan = this.planner.Plan(desired, current);
ModuleSet current = envTask.Result;
ModuleSet desired = configTask.Result;
ModuleSet updated = current;
Plan plan = await this.planner.PlanAsync(desired, current);

if (!plan.IsEmpty)
{
try
{
await plan.ExecuteAsync(token);
}
catch (Exception ex)
{
Events.PlanExecutionFailed(ex);
throw;
}
}
}
if (!plan.IsEmpty)
{
try
{
await plan.ExecuteAsync(token);
updated = await this.environment.GetModulesAsync(token);
}
catch (Exception ex)
{
Events.PlanExecutionFailed(ex);
throw;
}
}

static class Events
{
static readonly ILogger Log = Logger.Factory.CreateLogger<Agent>();
const int IdStart = AgentEventIds.Agent;
await this.reporter.ReportAsync(updated);
}

enum EventIds
{
AgentCreated = IdStart,
UpdateDesiredStateFailed,
PlanExecutionFailed
}
static class Events
{
static readonly ILogger Log = Logger.Factory.CreateLogger<Agent>();
const int IdStart = AgentEventIds.Agent;

public static void AgentCreated()
{
Log.LogDebug((int)EventIds.AgentCreated, "Agent Created.");
}
enum EventIds
{
AgentCreated = IdStart,
UpdateDesiredStateFailed,
PlanExecutionFailed
}

public static void UpdateDesiredStateFailed()
{
Log.LogError((int)EventIds.UpdateDesiredStateFailed, "Agent update to desired state failed.");
}
public static void AgentCreated()
{
Log.LogDebug((int)EventIds.AgentCreated, "Agent Created.");
}

public static void PlanExecutionFailed(Exception ex)
{
Log.LogError((int)EventIds.PlanExecutionFailed, ex, "Agent Plan execution failed.");
}
}
public static void UpdateDesiredStateFailed()
{
Log.LogError((int)EventIds.UpdateDesiredStateFailed, "Agent update to desired state failed.");
}

}
}
public static void PlanExecutionFailed(Exception ex)
{
Log.LogError((int)EventIds.PlanExecutionFailed, ex, "Agent Plan execution failed.");
}
}

}
}
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) Microsoft. All rights reserved.
namespace Microsoft.Azure.Devices.Edge.Agent.Core
{
public struct AgentEventIds
{
const int EventIdStart = 100000;
public const int Agent = EventIdStart;
public const int FileConfigSource = EventIdStart + 100;
public const int TwinConfigSource = EventIdStart + 200;
public const int RestartPlanner = EventIdStart + 300;
public const int Plan = EventIdStart + 400;
public const int FileBackupConfigSource = EventIdStart + 500;
public const int TwinReportStateCommandFactory = EventIdStart + 600;
}
}
/// <summary>
/// Starting log event ids for the agent's components. Each constant is
/// <c>EventIdStart</c> (100000) plus a distinct multiple of 100.
/// </summary>
/// <remarks>
/// A component uses its constant as the first value of its own event id enum; for
/// example, the <c>Agent</c> class assigns <c>AgentEventIds.Agent</c> to its
/// <c>IdStart</c> constant and starts its <c>EventIds</c> enum there.
/// </remarks>
public struct AgentEventIds
{
// Common base that every component id below is offset from.
const int EventIdStart = 100000;
public const int Agent = EventIdStart;
public const int FileConfigSource = EventIdStart + 100;
public const int TwinConfigSource = EventIdStart + 200;
public const int RestartPlanner = EventIdStart + 300;
public const int Plan = EventIdStart + 400;
public const int FileBackupConfigSource = EventIdStart + 500;
public const int HealthRestartPlanner = EventIdStart + 600;
public const int RestartManager = EventIdStart + 700;
public const int IoTHubReporter = EventIdStart + 800;
public const int DockerEnvironment = EventIdStart + 900;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.Azure.Devices.Edge.Agent.Core
{
/// <summary>
/// Compile-time constants for the edge agent: an owner identifier, key names,
/// default module settings and the label names grouped under <see cref="Labels"/>.
/// </summary>
public static class Constants
{
/// <summary>Owner identifier string for the edge agent.</summary>
// NOTE(review): presumably the value stored under Labels.Owner - confirm against the code that writes labels.
public const string Owner = "Microsoft.Azure.Devices.Edge.Agent";

/// <summary>Key name for the edge device connection string.</summary>
// NOTE(review): presumably a configuration key - confirm where it is looked up.
public const string EdgeDeviceConnectionStringKey = "EdgeDeviceConnectionString";

/// <summary>Key name for the edge hub connection string.</summary>
public const string EdgeHubConnectionStringKey = "EdgeHubConnectionString";

/// <summary>Key name for the module id.</summary>
public const string ModuleIdKey = "ModuleId";

/// <summary>Partition key string for the MMA store.</summary>
// NOTE(review): the store this key partitions is not visible here - confirm against its consumer.
public const string MMAStorePartitionKey = "mma";

/// <summary>Default restart policy: <see cref="RestartPolicy.OnUnhealthy"/>.</summary>
public const RestartPolicy DefaultRestartPolicy = RestartPolicy.OnUnhealthy;

/// <summary>Default desired module status: <see cref="ModuleStatus.Running"/>.</summary>
public const ModuleStatus DefaultDesiredStatus = ModuleStatus.Running;

/// <summary>
/// Label names, all sharing the <c>net.azure-devices.edge.</c> prefix.
/// </summary>
// NOTE(review): presumably applied as container labels by the Docker layer - confirm.
public static class Labels
{
public const string Version = "net.azure-devices.edge.version";
public const string Owner = "net.azure-devices.edge.owner";
public const string RestartPolicy = "net.azure-devices.edge.restartPolicy";
public const string DesiredStatus = "net.azure-devices.edge.desiredStatus";
public const string NormalizedCreateOptions = "net.azure-devices.edge.normalizedCreateOptions";
}
}
}
24 changes: 21 additions & 3 deletions edge-agent/src/Microsoft.Azure.Devices.Edge.Agent.Core/Diff.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,34 @@ public class Diff
public bool IsEmpty => this.Updated.Count == 0 && this.Removed.Count == 0;

/// <summary>
/// List of modules that have been updated
/// List of modules that have been updated or added.
/// </summary>
/// <remarks>
/// You might wonder why we do not have a separate property here to track "added" modules.
/// The reason is that this type (<see cref="Diff"/>) is used to deserialize patch updates to
/// the MMA's desired properties. The twin document delivered to us by IoT Hub does not
/// distinguish between added and updated modules. What has been "added" or "updated"
/// is only relevant when taking local state maintained in the MMA into account (captured
/// via <see cref="ModuleSet.Diff(ModuleSet)"/>).
/// </remarks>
public IImmutableSet<IModule> Updated { get; }

/// <summary>
/// List of modules names that have been removed
/// List of module names that have been removed.
/// </summary>
/// <remarks>
/// You might wonder why this is not an <see cref="IImmutableSet{IModule}"/> instead
/// of what it is here. The reason is that this type (<see cref="Diff"/>) is used to
/// deserialize patch updates to the MMA's desired properties. When a module is
/// removed it shows up as an entry in the JSON where the key is the module name and
/// the value is <c>null</c>. In this case, when deserializing the JSON, the deserializer
/// (<see cref="Microsoft.Azure.Devices.Edge.Agent.Core.Serde.DiffSerde"/>) is unable
/// to construct an <see cref="IModule"/> object from the value <c>null</c>. All it
/// can do is populate a set of strings with module names. Hence an <see cref="IImmutableSet{string}"/>.
/// </remarks>
public IImmutableSet<string> Removed { get; }

public Diff(IList<IModule> updated, IList<string> removed)
public Diff(IList<IModule> updated, IList<string> removed)
{
this.Updated = Preconditions.CheckNotNull(updated, nameof(updated)).ToImmutableHashSet();
this.Removed = Preconditions.CheckNotNull(removed, nameof(removed)).ToImmutableHashSet();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ public interface ICommandFactory

ICommand Start(IModule module);

ICommand Restart(IModule module);

ICommand Stop(IModule module);

ICommand Wrap(ICommand command);
}
}
73 changes: 66 additions & 7 deletions edge-agent/src/Microsoft.Azure.Devices.Edge.Agent.Core/IModule.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) Microsoft. All rights reserved.

namespace Microsoft.Azure.Devices.Edge.Agent.Core
{
Expand All @@ -10,16 +10,73 @@ namespace Microsoft.Azure.Devices.Edge.Agent.Core
[JsonConverter(typeof(StringEnumConverter))]
public enum ModuleStatus
{
/// <summary>
/// This is the state that all modules start out in. As soon as a deployment
/// is created, it is assumed that all modules in the deployment begin life
/// in the "Unknown" state.
/// </summary>
[EnumMember(Value = "unknown")]
Unknown,
Unknown, // TODO: Consider removing this status entirely since it doesn't seem to be used.

/// <summary>
/// Modules transition to the "Backoff" state when the MMA has scheduled
/// the module to be started but the module hasn't actually started running yet. This is
/// useful when we have a failing module that is undergoing state changes as
/// part of the implementation of its restart policy. For example when a failing
/// module is awaiting restart during the cool-off period as dictated by the
/// exponential back-off restart strategy, the module will be in this
/// "Backoff" state.
/// </summary>
[EnumMember(Value = "backoff")]
Backoff,

/// <summary>
/// This state indicates that the module is currently running.
/// </summary>
[EnumMember(Value = "running")]
Running,
[EnumMember(Value = "stopped")]
Stopped,
[EnumMember(Value = "paused")]
Paused,

/// <summary>
/// The state transitions to "unhealthy" when a health-probe check fails/times out.
/// </summary>
[EnumMember(Value = "unhealthy")]
Unhealthy,

/// <summary>
/// The "Stopped" state indicates that the module exited successfully (with a zero
/// exit code).
/// </summary>
[EnumMember(Value = "stopped")]
Stopped,

/// <summary>
/// The "Failed" state indicates that the module exited with a failure exit code
/// (non-zero). The module can transition back to "Backoff" from this state
/// depending on the restart policy in effect.
///
/// This state can indicate that the module has experienced an unrecoverable error.
/// This happens when the MMA has given up on trying to resuscitate the module and user
/// action is required to update its code/configuration in order for it to work again
/// which would mean that a new deployment is required.
/// </summary>
[EnumMember(Value = "failed")]
Failed
}

/// <summary>
/// Restart policy of a module, exposed on <c>IModule</c> as the <c>restartPolicy</c> JSON property.
/// </summary>
/// <remarks>
/// The type is converted with <c>StringEnumConverter</c>, and each member carries an
/// <c>EnumMember</c> value ("never", "on-failure", "on-unhealthy", "always") that gives its string name.
/// NOTE(review): the conditions under which each policy triggers a restart are decided by
/// the planner, which is not visible here - confirm the semantics there before relying on them.
/// </remarks>
[JsonConverter(typeof(StringEnumConverter))]
public enum RestartPolicy
{
// Numeric values are assigned explicitly, 0 through 3.
// NOTE(review): presumably so policies can be compared by ordering - confirm against callers.
[EnumMember(Value = "never")]
Never = 0,

[EnumMember(Value = "on-failure")]
OnFailure = 1,

[EnumMember(Value = "on-unhealthy")]
OnUnhealthy = 2,

[EnumMember(Value = "always")]
Always = 3
}

public interface IModule : IEquatable<IModule>
Expand All @@ -34,8 +91,10 @@ public interface IModule : IEquatable<IModule>
string Type { get; }

[JsonProperty(PropertyName = "status")]
ModuleStatus Status { get; }
ModuleStatus DesiredStatus { get; }

[JsonProperty(PropertyName = "restartPolicy")]
RestartPolicy RestartPolicy { get; }
}

public interface IModule<TConfig> : IModule, IEquatable<IModule<TConfig>>
Expand Down
Loading

0 comments on commit aeb5521

Please sign in to comment.