Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional error handling to CosmosHealthCheck #4781

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -61,48 +61,100 @@ public async Task GivenCosmosDbCanBeQueried_WhenHealthIsChecked_ThenHealthyState
Assert.Equal(HealthStatus.Healthy, result.Status);
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned()
[Theory]
[InlineData(typeof(CosmosOperationCanceledException))]
[InlineData(typeof(CosmosException))]
public async Task GivenCosmosDb_WhenRetryableExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned(Type exceptionType)
{
// This test simulates that all Health Check calls result in a retryable exception
// (a CosmosOperationCanceledException, or a CosmosException with status ServiceUnavailable).
// All retries should fail.

var diagnostics = Substitute.For<CosmosDiagnostics>();
var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);
// Arrange
Exception exception;

if (exceptionType == typeof(CosmosOperationCanceledException))
{
exception = new CosmosOperationCanceledException(
originalException: new OperationCanceledException(),
diagnostics: Substitute.For<CosmosDiagnostics>());
}
else if (exceptionType == typeof(CosmosException))
{
exception = new CosmosException(
message: "Service Unavailable",
statusCode: System.Net.HttpStatusCode.ServiceUnavailable,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);
}
else
{
throw new ArgumentException("Unsupported exception type.");
}

_testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception);

_testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(coce);
// Act
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

// Assert
Assert.Equal(HealthStatus.Unhealthy, result.Status);
_testProvider.ReceivedWithAnyArgs(3);
_testProvider.ReceivedWithAnyArgs(3); // Ensure the maximum retries were attempted
}

[Fact]
public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned()
[Theory]
[InlineData(typeof(CosmosOperationCanceledException))]
[InlineData(typeof(CosmosException))]
public async Task GivenCosmosDb_WhenRetryableExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned(Type exceptionType)
{
// This test simulates that the first call to Health Check results in a retryable exception
// (a CosmosOperationCanceledException, or a CosmosException with status ServiceUnavailable).
// The first attempt should fail, but the next ones should pass.

var diagnostics = Substitute.For<CosmosDiagnostics>();
var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics);
// Arrange
Exception exception;

int runs = 0;
Func<Task> fakeRetry = () =>
if (exceptionType == typeof(CosmosOperationCanceledException))
{
exception = new CosmosOperationCanceledException(
originalException: new OperationCanceledException(),
diagnostics: Substitute.For<CosmosDiagnostics>());
}
else if (exceptionType == typeof(CosmosException))
{
exception = new CosmosException(
message: "Service Unavailable",
statusCode: System.Net.HttpStatusCode.ServiceUnavailable,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);
}
else
{
runs++;
if (runs == 1)
throw new ArgumentException("Unsupported exception type.");
}

int runs = 0;

// Simulate failure on the first attempt and success on subsequent attempts
_testProvider.PerformTestAsync(default, CancellationToken.None)
.ReturnsForAnyArgs(_ =>
{
throw coce;
}
runs++;
if (runs == 1)
{
throw exception;
}

return Task.CompletedTask;
};
return Task.CompletedTask;
});

_testProvider.PerformTestAsync(default, CancellationToken.None).ReturnsForAnyArgs(x => fakeRetry());
// Act
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Healthy, result.Status);
_testProvider.ReceivedWithAnyArgs(2);
// Assert
Assert.Equal(HealthStatus.Healthy, result.Status); // Final state should be Healthy
Assert.Equal(2, runs); // Ensure 2 attempts were made
_testProvider.ReceivedWithAnyArgs(2); // Verify PerformTestAsync was called twice
}

[Fact]
Expand Down Expand Up @@ -130,11 +182,9 @@ public async Task GivenCosmosAccessIsForbidden_IsClientCmkError_WhenHealthIsChec
Assert.NotNull(result.Data);
Assert.True(result.Data.Any());

Assert.True(result.Data.ContainsKey("Reason"));
Assert.Equal(HealthStatusReason.CustomerManagedKeyAccessLost, result.Data["Reason"]);
VerifyErrorInResult(result.Data, "Reason", HealthStatusReason.CustomerManagedKeyAccessLost.ToString());

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error412.ToString(), result.Data["Error"]);
VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error412.ToString());
}
}

Expand All @@ -156,9 +206,7 @@ public async Task GivenCosmosAccessIsForbidden_IsNotClientCmkError_WhenHealthIsC

Assert.NotNull(result.Data);
Assert.True(result.Data.Any());

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error500.ToString(), result.Data["Error"]);
VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error500.ToString());
}

[Fact]
Expand All @@ -170,9 +218,38 @@ public async Task GivenCosmosDbWithTooManyRequests_WhenHealthIsChecked_ThenHealt
HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Degraded, result.Status);
VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error429.ToString());
}

[Fact]
public async Task GivenCosmosDbWithTimeout_WhenHealthIsChecked_ThenHealthyStateShouldBeReturned()
{
var exception = new CosmosException(
message: "RequestTimeout",
statusCode: HttpStatusCode.RequestTimeout,
subStatusCode: 0,
activityId: Guid.NewGuid().ToString(),
requestCharge: 0);

Assert.True(result.Data.ContainsKey("Error"));
Assert.Equal(FhirHealthErrorCode.Error429.ToString(), result.Data["Error"]);
_testProvider.PerformTestAsync(default, CancellationToken.None)
.ThrowsForAnyArgs(exception);

HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext());

Assert.Equal(HealthStatus.Degraded, result.Status);
VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error408.ToString());
}

private void VerifyErrorInResult(IReadOnlyDictionary<string, object> dictionary, string key, string expectedMessage)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this helper due to code scanning errors.

{
if (dictionary.TryGetValue(key, out var actualValue))
{
Assert.Equal(expectedMessage, actualValue.ToString());
}
else
{
Assert.Fail($"Expected key '{key}' not found in the dictionary.");
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

using System;
using System.Collections.Generic;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using EnsureThat;
Expand Down Expand Up @@ -66,6 +67,26 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
const int maxExecutionTimeInSeconds = 30;
const int maxNumberAttempts = 3;
int attempt = 0;

// CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.".
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
// Cosmos 503 and 449 are transient errors that can be retried.
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors
// Returns true when the exception is one the health check treats as retryable:
// any CosmosOperationCanceledException, or a CosmosException whose status code is
// ServiceUnavailable (503) or 449. Everything else (including null) is not retryable.
static bool IsRetryableException(Exception ex)
{
switch (ex)
{
case CosmosOperationCanceledException _:
return true;

case CosmosException cosmosFailure:
HttpStatusCode status = cosmosFailure.StatusCode;
return status == HttpStatusCode.ServiceUnavailable || status == (HttpStatusCode)449;

default:
return false;
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we include HttpStatusCode 408 as well?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading the code now I see that HTTP408 is treated differently. That's fine.


void LogAdditionalRetryableExceptionDetails(Exception exception)
{
if (exception is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also add the logging for HTTP449 and HTTP408?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

{
_logger.LogWarning(
cosmosException,
"Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}",
cosmosException.Diagnostics?.ToString() ?? "empty");
}
}

do
{
cancellationToken.ThrowIfCancellationRequested();
Expand All @@ -76,18 +97,17 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
await _testProvider.PerformTestAsync(_container.Value, operationTokenSource.Token);
return HealthCheckResult.Healthy("Successfully connected.");
}
catch (CosmosOperationCanceledException coce)
catch (Exception ex) when (IsRetryableException(ex))
{
// CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.".
// Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new
attempt++;

if (cancellationToken.IsCancellationRequested)
{
// Handling an external cancellation.
// No reasons to retry as the cancellation was external to the health check.

_logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested.");
_logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested.");
LogAdditionalRetryableExceptionDetails(ex);

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
Expand All @@ -102,10 +122,11 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
// This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful.

_logger.LogWarning(
coce,
ex,
"Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));
ex.GetType().Name);
LogAdditionalRetryableExceptionDetails(ex);

return HealthCheckResult.Unhealthy(
description: UnhealthyDescription,
Expand All @@ -118,12 +139,12 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
else
{
// Number of attempts not reached. Allow retry.

_logger.LogWarning(
coce,
ex,
"Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.",
attempt,
nameof(CosmosOperationCanceledException));
ex.GetType().Name);
LogAdditionalRetryableExceptionDetails(ex);
}
}
catch (CosmosException ex) when (ex.IsCmkClientError())
Expand All @@ -142,6 +163,22 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context
{ "Error", FhirHealthErrorCode.Error412.ToString() },
});
}
catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.RequestTimeout)
{
// Handling timeout exceptions

_logger.LogWarning(
ex,
"Failed to connect to the data store. Request has timed out.");

return HealthCheckResult.Degraded(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should timeout requests to CosmosDB result in Degraded or ServiceUnavailable? 408 status code can mean the database is overloaded from client requests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed with @fhibf - degraded is the proper behavior here.

description: DegradedDescription,
data: new Dictionary<string, object>
{
{ "Reason", HealthStatusReason.ServiceDegraded },
{ "Error", FhirHealthErrorCode.Error408.ToString() },
});
}
catch (Exception ex) when (ex.IsRequestRateExceeded())
{
// Handling request rate exceptions.
Expand Down
Loading