From c494b81de745ca2958087cf150026db83d9f811b Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 11:14:20 -0800 Subject: [PATCH 1/8] Added additional error handling to CosmosHealthchecks --- .../Features/Health/CosmosHealthCheck.cs | 123 ++++++++++++------ 1 file changed, 84 insertions(+), 39 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index b6782e62e9..fb7b8b90c8 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; +using System.Net; using System.Threading; using System.Threading.Tasks; using EnsureThat; @@ -79,51 +80,32 @@ public async Task CheckHealthAsync(HealthCheckContext context catch (CosmosOperationCanceledException coce) { // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". - // Reference: https://learn.microsoft.com/en-us/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new + // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new attempt++; - - if (cancellationToken.IsCancellationRequested) + var result = HandleRetry(coce, attempt, maxNumberAttempts, nameof(CosmosOperationCanceledException), cancellationToken); + if (result.HasValue) { - // Handling an extenal cancellation. - // No reasons to retry as the cancellation was external to the health check. - - _logger.LogWarning(coce, "Failed to connect to the data store. External cancellation requested."); - - return HealthCheckResult.Unhealthy( - description: UnhealthyDescription, - data: new Dictionary - { - { "Reason", HealthStatusReason.ServiceUnavailable }, - { "Error", FhirHealthErrorCode.Error408.ToString() }, - }); + return result.Value; } - else if (attempt >= maxNumberAttempts) + } + catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable || + ex.StatusCode == (HttpStatusCode)449) + { + // Cosmos 503 and 449 are transient errors that can be retried. + // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors + attempt++; + var result = HandleRetry(ex, attempt, maxNumberAttempts, nameof(CosmosException), cancellationToken); + + // Log additional diagnostics for 503 errors. + if (ex.StatusCode == HttpStatusCode.ServiceUnavailable) { - // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. - - _logger.LogWarning( - coce, - "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", - attempt, - nameof(CosmosOperationCanceledException)); - - return HealthCheckResult.Unhealthy( - description: UnhealthyDescription, - data: new Dictionary - { - { "Reason", HealthStatusReason.ServiceUnavailable }, - { "Error", FhirHealthErrorCode.Error501.ToString() }, - }); + var diagnostics = ex.Diagnostics?.ToString() ?? "empty"; + _logger.LogWarning(ex, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); } - else - { - // Number of attempts not reached. Allow retry. - _logger.LogWarning( - coce, - "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", - attempt, - nameof(CosmosOperationCanceledException)); + if (result.HasValue) + { + return result.Value; } } catch (CosmosException ex) when (ex.IsCmkClientError()) @@ -142,6 +124,22 @@ public async Task CheckHealthAsync(HealthCheckContext context { "Error", FhirHealthErrorCode.Error412.ToString() }, }); } + catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.RequestTimeout) + { + // Handling timeout exceptions + + _logger.LogWarning( + ex, + "Failed to connect to the data store. Request has timed out."); + + return HealthCheckResult.Degraded( + description: DegradedDescription, + data: new Dictionary + { + { "Reason", HealthStatusReason.ServiceDegraded }, + { "Error", FhirHealthErrorCode.Error408.ToString() }, + }); + } catch (Exception ex) when (ex.IsRequestRateExceeded()) { // Handling request rate exceptions. @@ -176,5 +174,52 @@ public async Task CheckHealthAsync(HealthCheckContext context } while (true); } + + private HealthCheckResult? HandleRetry(Exception ex, int attempt, int maxNumberAttempts, string exceptionType, CancellationToken cancellationToken) + { + if (cancellationToken.IsCancellationRequested) + { + // Handling an extenal cancellation. + // No reasons to retry as the cancellation was external to the health check. + + _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); + + return HealthCheckResult.Unhealthy( + description: UnhealthyDescription, + data: new Dictionary + { + { "Reason", HealthStatusReason.ServiceUnavailable }, + { "Error", FhirHealthErrorCode.Error408.ToString() }, + }); + } + + if (attempt >= maxNumberAttempts) + { + // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. + + _logger.LogWarning( + ex, + "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", + attempt, + exceptionType); + + return HealthCheckResult.Unhealthy( + description: UnhealthyDescription, + data: new Dictionary + { + { "Reason", HealthStatusReason.ServiceUnavailable }, + { "Error", FhirHealthErrorCode.Error501.ToString() }, + }); + } + + // Number of attempts not reached. Allow retry. + _logger.LogWarning( + ex, + "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", + attempt, + exceptionType); + + return null; // Indicates that the retry loop should continue. + } } } From e16044dc04c6b73e9e2f8dab98074a66957da632 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 11:57:26 -0800 Subject: [PATCH 2/8] Added tests --- .../Features/Health/CosmosHealthCheckTests.cs | 117 ++++++++++++++---- 1 file changed, 92 insertions(+), 25 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index c271c61976..4beca38d08 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -61,48 +61,94 @@ public async Task GivenCosmosDbCanBeQueried_WhenHealthIsChecked_ThenHealthyState Assert.Equal(HealthStatus.Healthy, result.Status); } - [Fact] - public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned() + [Theory] + [InlineData(typeof(CosmosOperationCanceledException))] + [InlineData(typeof(CosmosException))] + public async Task GivenCosmosDb_WhenRetryableExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned(Type exceptionType) { - // This test simulates that all Health Check calls result in OperationCanceledExceptions. - // And all retries should fail. + // Arrange + Exception exception; + + if (exceptionType == typeof(CosmosOperationCanceledException)) + { + exception = new CosmosOperationCanceledException( + originalException: new OperationCanceledException(), + diagnostics: Substitute.For()); + } + else if (exceptionType == typeof(CosmosException)) + { + exception = new CosmosException( + message: "Service Unavailable", + statusCode: System.Net.HttpStatusCode.ServiceUnavailable, + subStatusCode: 0, + activityId: Guid.NewGuid().ToString(), + requestCharge: 0); + } + else + { + throw new ArgumentException("Unsupported exception type."); + } - var diagnostics = Substitute.For(); - var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics); + _testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception); - _testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(coce); + // Act HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + // Assert Assert.Equal(HealthStatus.Unhealthy, result.Status); - _testProvider.ReceivedWithAnyArgs(3); + _testProvider.ReceivedWithAnyArgs(3); // Ensure the maximum retries were attempted } - [Fact] - public async Task GivenCosmosDb_WhenCosmosOperationCanceledExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned() + [Theory] + [InlineData(typeof(CosmosOperationCanceledException))] + [InlineData(typeof(CosmosException))] + public async Task GivenCosmosDb_WhenRetryableExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned(Type exceptionType) { - // This test simulates that the first call to Health Check results in an OperationCanceledException. - // The first attempt should fail, but the next ones should pass. + // Arrange + Exception exception; - var diagnostics = Substitute.For(); - var coce = new CosmosOperationCanceledException(originalException: new OperationCanceledException(), diagnostics); + if (exceptionType == typeof(CosmosOperationCanceledException)) + { + exception = new CosmosOperationCanceledException( + originalException: new OperationCanceledException(), + diagnostics: Substitute.For()); + } + else if (exceptionType == typeof(CosmosException)) + { + exception = new CosmosException( + message: "Service Unavailable", + statusCode: System.Net.HttpStatusCode.ServiceUnavailable, + subStatusCode: 0, + activityId: Guid.NewGuid().ToString(), + requestCharge: 0); + } + else + { + throw new ArgumentException("Unsupported exception type."); + } int runs = 0; - Func fakeRetry = () => - { - runs++; - if (runs == 1) + + // Simulate failure on the first attempt and success on subsequent attempts + _testProvider.PerformTestAsync(default, CancellationToken.None) + .ReturnsForAnyArgs(_ => { - throw coce; - } + runs++; + if (runs == 1) + { + throw exception; + } - return Task.CompletedTask; - }; + return Task.CompletedTask; + }); - _testProvider.PerformTestAsync(default, CancellationToken.None).ReturnsForAnyArgs(x => fakeRetry()); + // Act HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); - Assert.Equal(HealthStatus.Healthy, result.Status); - _testProvider.ReceivedWithAnyArgs(2); + // Assert + Assert.Equal(HealthStatus.Healthy, result.Status); // Final state should be Healthy + Assert.Equal(2, runs); // Ensure 2 attempts were made + _testProvider.ReceivedWithAnyArgs(2); // Verify PerformTestAsync was called twice } [Fact] @@ -174,5 +220,26 @@ public async Task GivenCosmosDbWithTooManyRequests_WhenHealthIsChecked_ThenHealt Assert.True(result.Data.ContainsKey("Error")); Assert.Equal(FhirHealthErrorCode.Error429.ToString(), result.Data["Error"]); } + + [Fact] + public async Task GivenCosmosDbWithTimeout_WhenHealthIsChecked_ThenHealthyStateShouldBeReturned() + { + var exception = new CosmosException( + message: "RequestTimeout", + statusCode: HttpStatusCode.RequestTimeout, + subStatusCode: 0, + activityId: Guid.NewGuid().ToString(), + requestCharge: 0); + + _testProvider.PerformTestAsync(default, CancellationToken.None) + .ThrowsForAnyArgs(exception); + + HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + + Assert.Equal(HealthStatus.Degraded, result.Status); + + Assert.True(result.Data.ContainsKey("Error")); + Assert.Equal(FhirHealthErrorCode.Error408.ToString(), result.Data["Error"]); + } } } From 8e85f65a1f299c7ab28cacb253512ad56735582c Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Wed, 15 Jan 2025 15:49:56 -0800 Subject: [PATCH 3/8] fix code scanning error in tests --- .../Features/Health/CosmosHealthCheckTests.cs | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index 4beca38d08..2cad88c630 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -176,11 +176,9 @@ public async Task GivenCosmosAccessIsForbidden_IsClientCmkError_WhenHealthIsChec Assert.NotNull(result.Data); Assert.True(result.Data.Any()); - Assert.True(result.Data.ContainsKey("Reason")); - Assert.Equal(HealthStatusReason.CustomerManagedKeyAccessLost, result.Data["Reason"]); + VerifyErrorInResult(result.Data, "Reason", HealthStatusReason.CustomerManagedKeyAccessLost.ToString()); - Assert.True(result.Data.ContainsKey("Error")); - Assert.Equal(FhirHealthErrorCode.Error412.ToString(), result.Data["Error"]); + VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error412.ToString()); } } @@ -202,9 +200,7 @@ public async Task GivenCosmosAccessIsForbidden_IsNotClientCmkError_WhenHealthIsC Assert.NotNull(result.Data); Assert.True(result.Data.Any()); - - Assert.True(result.Data.ContainsKey("Error")); - Assert.Equal(FhirHealthErrorCode.Error500.ToString(), result.Data["Error"]); + VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error500.ToString()); } [Fact] @@ -216,9 +212,7 @@ public async Task GivenCosmosDbWithTooManyRequests_WhenHealthIsChecked_ThenHealt HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); Assert.Equal(HealthStatus.Degraded, result.Status); - - Assert.True(result.Data.ContainsKey("Error")); - Assert.Equal(FhirHealthErrorCode.Error429.ToString(), result.Data["Error"]); + VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error429.ToString()); } [Fact] @@ -237,9 +231,19 @@ public async Task GivenCosmosDbWithTimeout_WhenHealthIsChecked_ThenHealthyStateS HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); Assert.Equal(HealthStatus.Degraded, result.Status); + VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error408.ToString()); + } - Assert.True(result.Data.ContainsKey("Error")); - Assert.Equal(FhirHealthErrorCode.Error408.ToString(), result.Data["Error"]); + private void VerifyErrorInResult(IReadOnlyDictionary dictionary, string key, string expectedMessage) + { + if (dictionary.TryGetValue(key, out var actualValue)) + { + Assert.Equal(expectedMessage, actualValue.ToString()); + } + else + { + Assert.Fail($"Expected key '{key}' not found in the dictionary."); + } } } } From 595035418d3e8e9fa5d74abf3f0b8a5ca54e8ddb Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Fri, 17 Jan 2025 10:47:54 -0800 Subject: [PATCH 4/8] make code easier to review --- .../Features/Health/CosmosHealthCheck.cs | 130 ++++++++---------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index fb7b8b90c8..79bf4503cc 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -67,6 +67,26 @@ public async Task CheckHealthAsync(HealthCheckContext context const int maxExecutionTimeInSeconds = 30; const int maxNumberAttempts = 3; int attempt = 0; + + // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". + // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new + // Cosmos 503 and 449 are transient errors that can be retried. + // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors + static bool IsRetryableException(Exception ex) => + ex is CosmosOperationCanceledException || + (ex is CosmosException cex && (cex.StatusCode == HttpStatusCode.ServiceUnavailable || cex.StatusCode == (HttpStatusCode)449)); + + void LogAdditionalRetryableExceptionDetails(Exception exception) + { + if (exception is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) + { + _logger.LogWarning( + cosmosException, + "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", + cosmosException.Diagnostics?.ToString() ?? "empty"); + } + } + do { cancellationToken.ThrowIfCancellationRequested(); @@ -77,35 +97,54 @@ public async Task CheckHealthAsync(HealthCheckContext context await _testProvider.PerformTestAsync(_container.Value, operationTokenSource.Token); return HealthCheckResult.Healthy("Successfully connected."); } - catch (CosmosOperationCanceledException coce) + catch (Exception ex) when (IsRetryableException(ex)) { - // CosmosOperationCanceledException are "safe to retry on and can be treated as timeouts from the retrying perspective.". - // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/troubleshoot-dotnet-sdk-request-timeout?tabs=cpu-new attempt++; - var result = HandleRetry(coce, attempt, maxNumberAttempts, nameof(CosmosOperationCanceledException), cancellationToken); - if (result.HasValue) + + if (cancellationToken.IsCancellationRequested) { - return result.Value; + // Handling an extenal cancellation. + // No reasons to retry as the cancellation was external to the health check. + + _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); + LogAdditionalRetryableExceptionDetails(ex); + + return HealthCheckResult.Unhealthy( + description: UnhealthyDescription, + data: new Dictionary + { + { "Reason", HealthStatusReason.ServiceUnavailable }, + { "Error", FhirHealthErrorCode.Error408.ToString() }, + }); } - } - catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.ServiceUnavailable || - ex.StatusCode == (HttpStatusCode)449) - { - // Cosmos 503 and 449 are transient errors that can be retried. - // Reference: https://learn.microsoft.com/azure/cosmos-db/nosql/conceptual-resilient-sdk-applications#should-my-application-retry-on-errors - attempt++; - var result = HandleRetry(ex, attempt, maxNumberAttempts, nameof(CosmosException), cancellationToken); - - // Log additional diagnostics for 503 errors. - if (ex.StatusCode == HttpStatusCode.ServiceUnavailable) + else if (attempt >= maxNumberAttempts) { - var diagnostics = ex.Diagnostics?.ToString() ?? "empty"; - _logger.LogWarning(ex, "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", diagnostics); + // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. + + _logger.LogWarning( + ex, + "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", + attempt, + ex.GetType().Name); + LogAdditionalRetryableExceptionDetails(ex); + + return HealthCheckResult.Unhealthy( + description: UnhealthyDescription, + data: new Dictionary + { + { "Reason", HealthStatusReason.ServiceUnavailable }, + { "Error", FhirHealthErrorCode.Error501.ToString() }, + }); } - - if (result.HasValue) + else { - return result.Value; + // Number of attempts not reached. Allow retry. + _logger.LogWarning( + ex, + "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", + attempt, + ex.GetType().Name); + LogAdditionalRetryableExceptionDetails(ex); } } catch (CosmosException ex) when (ex.IsCmkClientError()) @@ -174,52 +213,5 @@ public async Task CheckHealthAsync(HealthCheckContext context } while (true); } - - private HealthCheckResult? HandleRetry(Exception ex, int attempt, int maxNumberAttempts, string exceptionType, CancellationToken cancellationToken) - { - if (cancellationToken.IsCancellationRequested) - { - // Handling an extenal cancellation. - // No reasons to retry as the cancellation was external to the health check. - - _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); - - return HealthCheckResult.Unhealthy( - description: UnhealthyDescription, - data: new Dictionary - { - { "Reason", HealthStatusReason.ServiceUnavailable }, - { "Error", FhirHealthErrorCode.Error408.ToString() }, - }); - } - - if (attempt >= maxNumberAttempts) - { - // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. - - _logger.LogWarning( - ex, - "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", - attempt, - exceptionType); - - return HealthCheckResult.Unhealthy( - description: UnhealthyDescription, - data: new Dictionary - { - { "Reason", HealthStatusReason.ServiceUnavailable }, - { "Error", FhirHealthErrorCode.Error501.ToString() }, - }); - } - - // Number of attempts not reached. Allow retry. - _logger.LogWarning( - ex, - "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", - attempt, - exceptionType); - - return null; // Indicates that the retry loop should continue. - } } } From 03a7162bedc0898962ebcce55b1c0c1c6c88dc25 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Fri, 17 Jan 2025 10:53:05 -0800 Subject: [PATCH 5/8] add test descriptions back to code --- .../Features/Health/CosmosHealthCheckTests.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index 2cad88c630..1813bf507a 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -66,6 +66,9 @@ public async Task GivenCosmosDbCanBeQueried_WhenHealthIsChecked_ThenHealthyState [InlineData(typeof(CosmosException))] public async Task GivenCosmosDb_WhenRetryableExceptionIsAlwaysThrown_ThenUnhealthyStateShouldBeReturned(Type exceptionType) { + // This test simulates that all Health Check calls result in OperationCanceledExceptions. + // And all retries should fail. + // Arrange Exception exception; @@ -104,6 +107,9 @@ public async Task GivenCosmosDb_WhenRetryableExceptionIsAlwaysThrown_ThenUnhealt [InlineData(typeof(CosmosException))] public async Task GivenCosmosDb_WhenRetryableExceptionIsOnceThrown_ThenHealthyStateShouldBeReturned(Type exceptionType) { + // This test simulates that the first call to Health Check results in an OperationCanceledException. + // The first attempt should fail, but the next ones should pass. + // Arrange Exception exception; From 7653e90deeb3d5ee41f24cbbe452a3c00c408463 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Fri, 17 Jan 2025 14:48:46 -0800 Subject: [PATCH 6/8] removed unnecessary exception logging --- .../Features/Health/CosmosHealthCheck.cs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index 79bf4503cc..ad9fee7121 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -76,17 +76,6 @@ static bool IsRetryableException(Exception ex) => ex is CosmosOperationCanceledException || (ex is CosmosException cex && (cex.StatusCode == HttpStatusCode.ServiceUnavailable || cex.StatusCode == (HttpStatusCode)449)); - void LogAdditionalRetryableExceptionDetails(Exception exception) - { - if (exception is CosmosException cosmosException && cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable) - { - _logger.LogWarning( - cosmosException, - "Received a ServiceUnavailable response from Cosmos DB. Retrying. Diagnostics: {CosmosDiagnostics}", - cosmosException.Diagnostics?.ToString() ?? "empty"); - } - } - do { cancellationToken.ThrowIfCancellationRequested(); @@ -107,7 +96,6 @@ void LogAdditionalRetryableExceptionDetails(Exception exception) // No reasons to retry as the cancellation was external to the health check. _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); - LogAdditionalRetryableExceptionDetails(ex); return HealthCheckResult.Unhealthy( description: UnhealthyDescription, @@ -126,7 +114,6 @@ void LogAdditionalRetryableExceptionDetails(Exception exception) "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", attempt, ex.GetType().Name); - LogAdditionalRetryableExceptionDetails(ex); return HealthCheckResult.Unhealthy( description: UnhealthyDescription, @@ -144,7 +131,6 @@ void LogAdditionalRetryableExceptionDetails(Exception exception) "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", attempt, ex.GetType().Name); - LogAdditionalRetryableExceptionDetails(ex); } } catch (CosmosException ex) when (ex.IsCmkClientError()) From 27e88751a047849fd291eb6a2352cfc13d828b13 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Mon, 20 Jan 2025 14:21:09 -0800 Subject: [PATCH 7/8] Moved logging of CosmosDiagnostic to the property bag --- .../Features/Health/CosmosHealthCheckTests.cs | 96 ++++++++++++++++++- .../Features/Health/TestCosmosHealthCheck.cs | 2 +- .../Features/Health/CosmosHealthCheck.cs | 41 +++++--- 3 files changed, 125 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index 1813bf507a..ee39ad3079 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -12,6 +12,7 @@ using System.Threading.Tasks; using Microsoft.Azure.Cosmos; using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; using Microsoft.Health.Abstractions.Exceptions; @@ -19,6 +20,7 @@ using Microsoft.Health.Fhir.Core.Features.Health; using Microsoft.Health.Fhir.CosmosDb.Core.Configs; using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage; +using Microsoft.Health.Fhir.CosmosDb.Features.Health; using Microsoft.Health.Fhir.CosmosDb.Features.Storage; using Microsoft.Health.Fhir.Tests.Common; using Microsoft.Health.Test.Utilities; @@ -37,20 +39,21 @@ public class CosmosHealthCheckTests private readonly ICosmosClientTestProvider _testProvider = Substitute.For(); private readonly CosmosDataStoreConfiguration _configuration = new CosmosDataStoreConfiguration { DatabaseId = "mydb" }; private readonly CosmosCollectionConfiguration _cosmosCollectionConfiguration = new CosmosCollectionConfiguration { CollectionId = "mycoll" }; + private readonly ILogger _mockLogger = Substitute.For>(); private readonly TestCosmosHealthCheck _healthCheck; public CosmosHealthCheckTests() { var optionsSnapshot = Substitute.For>(); - optionsSnapshot.Get(Microsoft.Health.Fhir.CosmosDb.Constants.CollectionConfigurationName).Returns(_cosmosCollectionConfiguration); + optionsSnapshot.Get(Constants.CollectionConfigurationName).Returns(_cosmosCollectionConfiguration); _healthCheck = new TestCosmosHealthCheck( new NonDisposingScope(_container), _configuration, optionsSnapshot, _testProvider, - NullLogger.Instance); + _mockLogger); } [Fact] @@ -240,6 +243,63 @@ public async Task GivenCosmosDbWithTimeout_WhenHealthIsChecked_ThenHealthyStateS VerifyErrorInResult(result.Data, "Error", FhirHealthErrorCode.Error408.ToString()); } + [Fact] + public async Task GivenCosmosException_WhenLogged_ThenDiagnosticsShouldBeIncludedInLog() + { + // This test ensures the CosmosDiagnostics are logged when a CosmosException is thrown. + + // Arrange + var diagnosticsString = "Mock diagnostics data"; + var mockDiagnostics = new TestCosmosDiagnostics(diagnosticsString); + + var exception = new TestCosmosException( + message: "Service Unavailable", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: 0, + activityId: Guid.NewGuid().ToString(), + requestCharge: 0, + diagnostics: mockDiagnostics); + + _testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception); + + // Act + HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + + // Assert + _mockLogger.Received().Log( + LogLevel.Warning, + Arg.Any(), + Arg.Is(v => v.ToString().Contains($"CosmosDiagnostics: {diagnosticsString}")), + exception, + Arg.Any>()); + } + + [Fact] + public async Task GivenCosmosExceptionWithoutDiagnostics_WhenLogged_ThenMessageShouldNotIncludeDiagnostics() + { + // Arrange + var exception = new TestCosmosException( + message: "Service Unavailable", + statusCode: HttpStatusCode.ServiceUnavailable, + subStatusCode: 0, + activityId: Guid.NewGuid().ToString(), + requestCharge: 0, + diagnostics: null); + + _testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception); + + // Act + await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + + // Assert + _mockLogger.Received().Log( + LogLevel.Warning, + Arg.Any(), + Arg.Is(v => !v.ToString().Contains("CosmosDiagnostics")), + exception, + Arg.Any>()); + } + private void VerifyErrorInResult(IReadOnlyDictionary dictionary, string key, string expectedMessage) { if (dictionary.TryGetValue(key, out var actualValue)) @@ -251,5 +311,37 @@ private void VerifyErrorInResult(IReadOnlyDictionary dictionary, Assert.Fail($"Expected key '{key}' not found in the dictionary."); } } + + // Allows for testing CosmosExceptions with CosmosDiagnostics as the field is read-only in the base class. + public class TestCosmosException : CosmosException + { + private readonly CosmosDiagnostics _diagnostics; + + public TestCosmosException(string message, HttpStatusCode statusCode, int subStatusCode, string activityId, double requestCharge, CosmosDiagnostics diagnostics) + : base(message, statusCode, subStatusCode, activityId, requestCharge) + { + _diagnostics = diagnostics; + } + + public override CosmosDiagnostics Diagnostics => _diagnostics; + } + + // Allows for testing CosmosDiagnostics flows through to the logger. CosmosDiagnostics is an abstract class. + public class TestCosmosDiagnostics : CosmosDiagnostics + { + private readonly string _testDiagnosticsString; + + public TestCosmosDiagnostics(string testDiagnosticsString) + { + _testDiagnosticsString = testDiagnosticsString; + } + + public override IReadOnlyList<(string regionName, Uri uri)> GetContactedRegions() + { + throw new NotImplementedException(); + } + + public override string ToString() => _testDiagnosticsString; + } } } diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/TestCosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/TestCosmosHealthCheck.cs index 5622335466..82fb271afc 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/TestCosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/TestCosmosHealthCheck.cs @@ -14,7 +14,7 @@ namespace Microsoft.Health.Fhir.CosmosDb.UnitTests.Features.Health { - internal class TestCosmosHealthCheck : CosmosHealthCheck + public class TestCosmosHealthCheck : CosmosHealthCheck { public TestCosmosHealthCheck( IScoped container, diff --git a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs index ad9fee7121..5154d6632c 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb/Features/Health/CosmosHealthCheck.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Net; using System.Threading; using System.Threading.Tasks; @@ -76,6 +77,21 @@ static bool IsRetryableException(Exception ex) => ex is CosmosOperationCanceledException || (ex is CosmosException cex && (cex.StatusCode == HttpStatusCode.ServiceUnavailable || cex.StatusCode == (HttpStatusCode)449)); + // Adds the CosmosDiagnostics to the log message if the exception is a CosmosException. + // This avoids truncation of the diagnostics details in the exception by moving it to the properties bag. + void LogWithDetails(LogLevel logLevel, Exception ex, string message, IEnumerable logArgs = null) + { + logArgs ??= []; + + if (ex is CosmosException cosmosException && cosmosException.Diagnostics is not null) + { + message = message + " CosmosDiagnostics: {CosmosDiagnostics}"; + logArgs = logArgs.Append(cosmosException.Diagnostics.ToString()); + } + + _logger.Log(logLevel, ex, message, logArgs.ToArray()); + } + do { cancellationToken.ThrowIfCancellationRequested(); @@ -95,7 +111,7 @@ ex is CosmosOperationCanceledException || // Handling an extenal cancellation. // No reasons to retry as the cancellation was external to the health check. - _logger.LogWarning(ex, "Failed to connect to the data store. External cancellation requested."); + LogWithDetails(LogLevel.Warning, ex, "Failed to connect to the data store. External cancellation requested."); return HealthCheckResult.Unhealthy( description: UnhealthyDescription, @@ -109,11 +125,11 @@ ex is CosmosOperationCanceledException || { // This is a very rare situation. This condition indicates that multiple attempts to connect to the data store happened, but they were not successful. - _logger.LogWarning( + LogWithDetails( + LogLevel.Warning, ex, "Failed to connect to the data store. There were {NumberOfAttempts} attempts to connect to the data store, but they suffered a '{ExceptionType}'.", - attempt, - ex.GetType().Name); + [attempt, ex.GetType().Name]); return HealthCheckResult.Unhealthy( description: UnhealthyDescription, @@ -126,18 +142,19 @@ ex is CosmosOperationCanceledException || else { // Number of attempts not reached. Allow retry. - _logger.LogWarning( + LogWithDetails( + LogLevel.Warning, ex, "Failed to connect to the data store. Attempt {NumberOfAttempts}. '{ExceptionType}'.", - attempt, - ex.GetType().Name); + [attempt, ex.GetType().Name]); } } catch (CosmosException ex) when (ex.IsCmkClientError()) { // Handling CMK errors. - _logger.LogWarning( + LogWithDetails( + LogLevel.Warning, ex, "Connection to the data store was unsuccesful because the client's customer-managed key is not available."); @@ -153,7 +170,8 @@ ex is CosmosOperationCanceledException || { // Handling timeout exceptions - _logger.LogWarning( + LogWithDetails( + LogLevel.Warning, ex, "Failed to connect to the data store. Request has timed out."); @@ -169,7 +187,8 @@ ex is CosmosOperationCanceledException || { // Handling request rate exceptions. - _logger.LogWarning( + LogWithDetails( + LogLevel.Warning, ex, "Failed to connect to the data store. Rate limit has been exceeded."); @@ -186,7 +205,7 @@ ex is CosmosOperationCanceledException || // Handling other exceptions. const string message = "Failed to connect to the data store."; - _logger.LogWarning(ex, message); + LogWithDetails(LogLevel.Warning, ex, message); return HealthCheckResult.Unhealthy( description: UnhealthyDescription, From dfe120dcdf8f948ef99522d738ce34a6bdf7abb5 Mon Sep 17 00:00:00 2001 From: Mikael Weaver Date: Mon, 20 Jan 2025 20:18:06 -0800 Subject: [PATCH 8/8] fix code scanning error --- .../Features/Health/CosmosHealthCheckTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs index ee39ad3079..b48dc903d2 100644 --- a/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs +++ b/src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Health/CosmosHealthCheckTests.cs @@ -263,7 +263,7 @@ public async Task GivenCosmosException_WhenLogged_ThenDiagnosticsShouldBeInclude _testProvider.PerformTestAsync(default, CancellationToken.None).ThrowsForAnyArgs(exception); // Act - HealthCheckResult result = await _healthCheck.CheckHealthAsync(new HealthCheckContext()); + await _healthCheck.CheckHealthAsync(new HealthCheckContext()); // Assert _mockLogger.Received().Log(