-
Notifications
You must be signed in to change notification settings - Fork 521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add additional error handling to CosmosHealthCheck #4781
base: main
Are you sure you want to change the base?
Changes from 5 commits
c494b81
e16044d
8e85f65
5950354
03a7162
7653e90
27e8875
dfe120d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.Net; | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
using EnsureThat; | ||
|
@@ -66,6 +67,26 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context | |
const int maxExecutionTimeInSeconds = 30; | ||
const int maxNumberAttempts = 3; | ||
int attempt = 0; | ||
|
||
// CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". | ||
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new | ||
// Cosmos 503 and 449 are transient errors that can be retried. | ||
// Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors | ||
static bool IsRetryableException(Exception ex) => | ||
ex is CosmosOperationCanceledException || | ||
(ex is CosmosException cex && (cex.StatusCode == HttpStatusCode.ServiceUnavailable || cex.StatusCode == (HttpStatusCode)449)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we include HttpStatusCode 408 as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reading the code now I see that HTTP408 is treated differently. That's fine. |
||
|
||
void LogAdditionalRetryableExceptionDetails(Exception exception) | ||
{ | ||
if (exception is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also add the logginc for HTTP449 and HTTP408? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually this can be removed because it's added when useful to the exception message in the CosmosSDK. |
||
{ | ||
_logger.LogWarning( | ||
cosmosException, | ||
"Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", | ||
cosmosException.Diagnostics?.ToString() ?? "empty"); | ||
} | ||
} | ||
|
||
do | ||
{ | ||
cancellationToken.ThrowIfCancellationRequested(); | ||
|
@@ -76,18 +97,17 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context | |
await _testProvider.PerformTestAsync(_container.Value, operationTokenSource.Token); | ||
return HealthCheckResult.Healthy("Successfully connected."); | ||
} | ||
catch (CosmosOperationCanceledException coce) | ||
catch (Exception ex) when (IsRetryableException(ex)) | ||
{ | ||
// CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". | ||
// Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new | ||
attempt++; | ||
|
||
if (cancellationToken.IsCancellationRequested) | ||
{ | ||
// Handling an extenal cancellation. | ||
// No reasons to retry as the cancellation was external to the health check. | ||
|
||
_logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested."); | ||
_logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); | ||
LogAdditionalRetryableExceptionDetails(ex); | ||
|
||
return HealthCheckResult.Unhealthy( | ||
description: UnhealthyDescription, | ||
|
@@ -102,10 +122,11 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context | |
// This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. | ||
|
||
_logger.LogWarning( | ||
coce, | ||
ex, | ||
"Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", | ||
attempt, | ||
nameof(CosmosOperationCanceledException)); | ||
ex.GetType().Name); | ||
LogAdditionalRetryableExceptionDetails(ex); | ||
|
||
return HealthCheckResult.Unhealthy( | ||
description: UnhealthyDescription, | ||
|
@@ -118,12 +139,12 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context | |
else | ||
{ | ||
// Number of attempts not reached. Allow retry. | ||
|
||
_logger.LogWarning( | ||
coce, | ||
ex, | ||
"Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", | ||
attempt, | ||
nameof(CosmosOperationCanceledException)); | ||
ex.GetType().Name); | ||
LogAdditionalRetryableExceptionDetails(ex); | ||
} | ||
} | ||
catch (CosmosException ex) when (ex.IsCmkClientError()) | ||
|
@@ -142,6 +163,22 @@ public async Task<HealthCheckResult> CheckHealthAsync(HealthCheckContext context | |
{ "Error", FhirHealthErrorCode.Error412.ToString() }, | ||
}); | ||
} | ||
catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.RequestTimeout) | ||
{ | ||
// Handling timeout exceptions | ||
|
||
_logger.LogWarning( | ||
ex, | ||
"Failed to connect to the data store. Request has timed out."); | ||
|
||
return HealthCheckResult.Degraded( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should timeout requests to CosmosDB result in Degraded or ServiceUnavailable? 408 status code can mean the database is overloaded from client requests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed with @fhibf - degraded is the proper behavior here. |
||
description: DegradedDescription, | ||
data: new Dictionary<string, object> | ||
{ | ||
{ "Reason", HealthStatusReason.ServiceDegraded }, | ||
{ "Error", FhirHealthErrorCode.Error408.ToString() }, | ||
}); | ||
} | ||
catch (Exception ex) when (ex.IsRequestRateExceeded()) | ||
{ | ||
// Handling request rate exceptions. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added this helper due to code scanning errors.