Skip to content

Commit c8f678c

Browse files
authored
Moved cosmos job hosting to use unified retry logic (#4773)
* Moved cosmos job hosting to use unified retry logic * Improve exception policy per cosmos recommendations * remove unneeded using * fix reindex worker context * fix code scanning issue * handle case of null diagnostics * Add retry unit tests * formatting fixes * remove unneeded usings * updated retry policy * cleanup * add logging to new tests * relaxed retry test timing due to stopwatch imprecision * removed primary constructor in CosmosQueueClientTests child test class * advanced base, non-default delay
1 parent bc6b453 commit c8f678c

File tree

9 files changed

+249
-28
lines changed

9 files changed

+249
-28
lines changed

src/Microsoft.Health.Fhir.Core.UnitTests/Features/Operations/Reindex/ReindexJobWorkerTests.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99
using System.Threading.Tasks;
1010
using Microsoft.Extensions.Logging.Abstractions;
1111
using Microsoft.Extensions.Options;
12+
using Microsoft.Health.Core.Features.Context;
1213
using Microsoft.Health.Extensions.DependencyInjection;
1314
using Microsoft.Health.Fhir.Core.Configs;
15+
using Microsoft.Health.Fhir.Core.Features.Context;
1416
using Microsoft.Health.Fhir.Core.Features.Operations;
1517
using Microsoft.Health.Fhir.Core.Features.Operations.Reindex;
1618
using Microsoft.Health.Fhir.Core.Features.Operations.Reindex.Models;
@@ -59,6 +61,7 @@ public ReindexJobWorkerTests()
5961
Options.Create(_reindexJobConfiguration),
6062
_reindexJobTask.CreateMockScopeProvider(),
6163
searchParameterOperations,
64+
Substitute.For<RequestContextAccessor<IFhirRequestContext>>(),
6265
NullLogger<ReindexJobWorker>.Instance);
6366

6467
_reindexJobWorker.Handle(new Messages.Search.SearchParametersInitializedNotification(), CancellationToken.None);

src/Microsoft.Health.Fhir.Core/Features/Operations/Reindex/ReindexJobWorker.cs

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,11 @@
1212
using MediatR;
1313
using Microsoft.Extensions.Logging;
1414
using Microsoft.Extensions.Options;
15+
using Microsoft.Extensions.Primitives;
16+
using Microsoft.Health.Core.Features.Context;
1517
using Microsoft.Health.Extensions.DependencyInjection;
1618
using Microsoft.Health.Fhir.Core.Configs;
19+
using Microsoft.Health.Fhir.Core.Features.Context;
1720
using Microsoft.Health.Fhir.Core.Features.Operations.Reindex.Models;
1821
using Microsoft.Health.Fhir.Core.Features.Search.Parameters;
1922
using Microsoft.Health.Fhir.Core.Messages.Search;
@@ -29,6 +32,7 @@ public class ReindexJobWorker : INotificationHandler<SearchParametersInitialized
2932
private readonly ReindexJobConfiguration _reindexJobConfiguration;
3033
private readonly IScopeProvider<IReindexJobTask> _reindexJobTaskFactory;
3134
private readonly ISearchParameterOperations _searchParameterOperations;
35+
private readonly RequestContextAccessor<IFhirRequestContext> _contextAccessor;
3236
private readonly ILogger _logger;
3337
private bool _searchParametersInitialized = false;
3438

@@ -37,19 +41,15 @@ public ReindexJobWorker(
3741
IOptions<ReindexJobConfiguration> reindexJobConfiguration,
3842
IScopeProvider<IReindexJobTask> reindexJobTaskFactory,
3943
ISearchParameterOperations searchParameterOperations,
44+
RequestContextAccessor<IFhirRequestContext> contextAccessor,
4045
ILogger<ReindexJobWorker> logger)
4146
{
42-
EnsureArg.IsNotNull(fhirOperationDataStoreFactory, nameof(fhirOperationDataStoreFactory));
43-
EnsureArg.IsNotNull(reindexJobConfiguration?.Value, nameof(reindexJobConfiguration));
44-
EnsureArg.IsNotNull(reindexJobTaskFactory, nameof(reindexJobTaskFactory));
45-
EnsureArg.IsNotNull(searchParameterOperations, nameof(searchParameterOperations));
46-
EnsureArg.IsNotNull(logger, nameof(logger));
47-
48-
_fhirOperationDataStoreFactory = fhirOperationDataStoreFactory;
49-
_reindexJobConfiguration = reindexJobConfiguration.Value;
50-
_reindexJobTaskFactory = reindexJobTaskFactory;
51-
_searchParameterOperations = searchParameterOperations;
52-
_logger = logger;
47+
_fhirOperationDataStoreFactory = EnsureArg.IsNotNull(fhirOperationDataStoreFactory, nameof(fhirOperationDataStoreFactory));
48+
_reindexJobConfiguration = EnsureArg.IsNotNull(reindexJobConfiguration?.Value, nameof(reindexJobConfiguration));
49+
_reindexJobTaskFactory = EnsureArg.IsNotNull(reindexJobTaskFactory, nameof(reindexJobTaskFactory));
50+
_searchParameterOperations = EnsureArg.IsNotNull(searchParameterOperations, nameof(searchParameterOperations));
51+
_contextAccessor = EnsureArg.IsNotNull(contextAccessor, nameof(contextAccessor));
52+
_logger = EnsureArg.IsNotNull(logger, nameof(logger));
5353
}
5454

5555
public async Task ExecuteAsync(CancellationToken cancellationToken)
@@ -60,6 +60,22 @@ public async Task ExecuteAsync(CancellationToken cancellationToken)
6060
{
6161
if (_searchParametersInitialized)
6262
{
63+
var originalRequestContext = _contextAccessor.RequestContext;
64+
65+
// Create a background task context to trigger the correct retry policy.
66+
var fhirRequestContext = new FhirRequestContext(
67+
method: nameof(ReindexJobWorker),
68+
uriString: nameof(ReindexJobWorker),
69+
baseUriString: nameof(ReindexJobWorker),
70+
correlationId: Guid.NewGuid().ToString(),
71+
requestHeaders: new Dictionary<string, StringValues>(),
72+
responseHeaders: new Dictionary<string, StringValues>())
73+
{
74+
IsBackgroundTask = true,
75+
};
76+
77+
_contextAccessor.RequestContext = fhirRequestContext;
78+
6379
// Check for any changes to Search Parameters
6480
try
6581
{
@@ -124,6 +140,8 @@ public async Task ExecuteAsync(CancellationToken cancellationToken)
124140
// The job failed.
125141
_logger.LogError(ex, "Error polling Reindex jobs.");
126142
}
143+
144+
_contextAccessor.RequestContext = originalRequestContext;
127145
}
128146

129147
try

src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/CosmosFhirDataStoreTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public CosmosFhirDataStoreTests()
7676
_cosmosDataStoreConfiguration,
7777
Substitute.For<IOptionsMonitor<CosmosCollectionConfiguration>>(),
7878
_cosmosQueryFactory,
79-
new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, requestContextAccessor),
79+
new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, requestContextAccessor, NullLogger<RetryExceptionPolicyFactory>.Instance),
8080
NullLogger<CosmosFhirDataStore>.Instance,
8181
Options.Create(new CoreFeatureConfiguration()),
8282
_bundleOrchestrator,

src/Microsoft.Health.Fhir.CosmosDb.UnitTests/Features/Storage/FhirCosmosClientInitializerTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public FhirCosmosClientInitializerTests()
4040
_initializer = new FhirCosmosClientInitializer(
4141
clientTestProvider,
4242
() => new[] { new TestRequestHandler() },
43-
new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, Substitute.For<RequestContextAccessor<IFhirRequestContext>>()),
43+
new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, Substitute.For<RequestContextAccessor<IFhirRequestContext>>(), NullLogger<RetryExceptionPolicyFactory>.Instance),
4444
Substitute.For<CosmosAccessTokenProviderFactory>(),
4545
NullLogger<FhirCosmosClientInitializer>.Instance);
4646

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
// -------------------------------------------------------------------------------------------------
2+
// Copyright (c) Microsoft Corporation. All rights reserved.
3+
// Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
4+
// -------------------------------------------------------------------------------------------------
5+
6+
using System;
7+
using System.Diagnostics;
8+
using System.Net;
9+
using System.Threading;
10+
using System.Threading.Tasks;
11+
using Microsoft.Azure.Cosmos;
12+
using Microsoft.Extensions.Logging.Abstractions;
13+
using Microsoft.Health.Abstractions.Exceptions;
14+
using Microsoft.Health.Core.Features.Context;
15+
using Microsoft.Health.Extensions.DependencyInjection;
16+
using Microsoft.Health.Fhir.Core.Features.Context;
17+
using Microsoft.Health.Fhir.CosmosDb.Core.Configs;
18+
using Microsoft.Health.Fhir.CosmosDb.Core.Features.Storage;
19+
using Microsoft.Health.Fhir.CosmosDb.Features.Queries;
20+
using Microsoft.Health.Fhir.CosmosDb.Features.Storage;
21+
using Microsoft.Health.Fhir.CosmosDb.Features.Storage.Queues;
22+
using Microsoft.Health.Fhir.Tests.Common;
23+
using Microsoft.Health.Test.Utilities;
24+
using NSubstitute;
25+
using Xunit;
26+
27+
namespace Microsoft.Health.Fhir.CosmosDb.UnitTests.Features.Storage.Queues;
28+
29+
[Trait(Traits.OwningTeam, OwningTeam.Fhir)]
30+
[Trait(Traits.Category, Categories.DataSourceValidation)]
31+
public class CosmosQueueClientTests
32+
{
33+
private readonly ICosmosQueryFactory _cosmosQueryFactory;
34+
private readonly ICosmosDbDistributedLockFactory _distributedLockFactory;
35+
private readonly CosmosDataStoreConfiguration _cosmosDataStoreConfiguration = new CosmosDataStoreConfiguration();
36+
private readonly RequestContextAccessor<IFhirRequestContext> _requestContextAccessor;
37+
private readonly RetryExceptionPolicyFactory _retryPolicyFactory;
38+
private readonly CosmosQueueClient _cosmosQueueClient;
39+
40+
public CosmosQueueClientTests()
41+
{
42+
_cosmosQueryFactory = Substitute.For<ICosmosQueryFactory>();
43+
_distributedLockFactory = Substitute.For<ICosmosDbDistributedLockFactory>();
44+
_requestContextAccessor = Substitute.For<RequestContextAccessor<IFhirRequestContext>>();
45+
_retryPolicyFactory = new RetryExceptionPolicyFactory(_cosmosDataStoreConfiguration, _requestContextAccessor, NullLogger<RetryExceptionPolicyFactory>.Instance);
46+
47+
_cosmosQueueClient = new CosmosQueueClient(
48+
Substitute.For<Func<IScoped<Container>>>(),
49+
_cosmosQueryFactory,
50+
_distributedLockFactory,
51+
_retryPolicyFactory);
52+
}
53+
54+
[Theory]
55+
[InlineData(HttpStatusCode.ServiceUnavailable)]
56+
[InlineData(HttpStatusCode.TooManyRequests)]
57+
[InlineData(HttpStatusCode.Gone)]
58+
[InlineData((HttpStatusCode)449)]
59+
[InlineData(HttpStatusCode.RequestTimeout)]
60+
public async Task GivenADequeueJobOperation_WhenExceptionOccurs_RetryWillHappen(HttpStatusCode statusCode)
61+
{
62+
// Arrange
63+
ICosmosQuery<JobGroupWrapper> cosmosQuery = Substitute.For<ICosmosQuery<JobGroupWrapper>>();
64+
_cosmosQueryFactory.Create<JobGroupWrapper>(Arg.Any<Container>(), Arg.Any<CosmosQueryContext>())
65+
.ReturnsForAnyArgs(cosmosQuery);
66+
67+
int callCount = 0;
68+
cosmosQuery.ExecuteNextAsync(Arg.Any<CancellationToken>()).ReturnsForAnyArgs(_ =>
69+
{
70+
if (callCount++ == 0)
71+
{
72+
throw new TestCosmosException(statusCode);
73+
}
74+
75+
return Task.FromResult(Substitute.For<FeedResponse<JobGroupWrapper>>());
76+
});
77+
78+
// Act
79+
await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None);
80+
81+
// Assert
82+
Assert.Equal(2, callCount);
83+
await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any<CancellationToken>());
84+
}
85+
86+
[Theory]
87+
[InlineData(typeof(CosmosException))]
88+
[InlineData(typeof(RequestRateExceededException))]
89+
public async Task GivenADequeueJobOperation_WhenExceptionWithRetryAfterIsProvided_PolicyRespectsRetryAfter(Type exceptionType)
90+
{
91+
// Arrange
92+
ICosmosQuery<JobGroupWrapper> cosmosQuery = Substitute.For<ICosmosQuery<JobGroupWrapper>>();
93+
_cosmosQueryFactory.Create<JobGroupWrapper>(Arg.Any<Container>(), Arg.Any<CosmosQueryContext>())
94+
.ReturnsForAnyArgs(cosmosQuery);
95+
var retryAfter = TimeSpan.FromSeconds(2);
96+
int callCount = 0;
97+
98+
cosmosQuery.ExecuteNextAsync(Arg.Any<CancellationToken>()).ReturnsForAnyArgs(_ =>
99+
{
100+
if (callCount++ == 0)
101+
{
102+
throw exceptionType == typeof(CosmosException)
103+
? new TestCosmosException(HttpStatusCode.TooManyRequests, retryAfter)
104+
: new RequestRateExceededException(retryAfter);
105+
}
106+
107+
return Task.FromResult(Substitute.For<FeedResponse<JobGroupWrapper>>());
108+
});
109+
110+
var stopwatch = Stopwatch.StartNew();
111+
112+
// Act
113+
await _cosmosQueueClient.DequeueAsync(0, "testworker", 10, CancellationToken.None);
114+
115+
stopwatch.Stop();
116+
117+
// Assert
118+
Assert.Equal(2, callCount);
119+
await cosmosQuery.ReceivedWithAnyArgs(2).ExecuteNextAsync(Arg.Any<CancellationToken>());
120+
121+
// Allowing small imprecision due to timer resolution
122+
var actualElapsedSeconds = stopwatch.Elapsed.TotalSeconds;
123+
Assert.True(
124+
Math.Abs(actualElapsedSeconds - retryAfter.TotalSeconds) <= 0.5,
125+
$"Expected retry after {retryAfter.TotalSeconds} seconds, but actual elapsed time was {actualElapsedSeconds} seconds.");
126+
}
127+
128+
public class TestCosmosException : CosmosException
129+
{
130+
private readonly TimeSpan? _retryAfter;
131+
132+
public TestCosmosException(HttpStatusCode statusCode, TimeSpan? retryAfter = null)
133+
: base("Test exception message", statusCode, 0, "test-activity-id", 0.0)
134+
{
135+
_retryAfter = retryAfter;
136+
}
137+
138+
public override TimeSpan? RetryAfter => _retryAfter;
139+
}
140+
}

src/Microsoft.Health.Fhir.CosmosDb/Features/Storage/Queues/CosmosQueueClient.cs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
using System.Threading.Tasks;
1313
using EnsureThat;
1414
using Microsoft.Azure.Cosmos;
15+
using Microsoft.Extensions.Logging;
1516
using Microsoft.Health.Abstractions.Exceptions;
1617
using Microsoft.Health.Core;
1718
using Microsoft.Health.Core.Extensions;
@@ -29,20 +30,21 @@ public class CosmosQueueClient : IQueueClient
2930
private readonly Func<IScoped<Container>> _containerFactory;
3031
private readonly ICosmosQueryFactory _queryFactory;
3132
private readonly ICosmosDbDistributedLockFactory _distributedLockFactory;
32-
private static readonly AsyncPolicy _retryPolicy = Policy
33-
.Handle<CosmosException>(ex => ex.StatusCode == HttpStatusCode.PreconditionFailed)
34-
.Or<CosmosException>(ex => ex.StatusCode == HttpStatusCode.TooManyRequests)
35-
.Or<RequestRateExceededException>()
36-
.WaitAndRetryAsync(5, _ => TimeSpan.FromMilliseconds(RandomNumberGenerator.GetInt32(100, 1000)));
33+
private readonly RetryExceptionPolicyFactory _retryExceptionPolicyFactory;
34+
private readonly AsyncPolicy _retryPolicy;
3735

3836
public CosmosQueueClient(
3937
Func<IScoped<Container>> containerFactory,
4038
ICosmosQueryFactory queryFactory,
41-
ICosmosDbDistributedLockFactory distributedLockFactory)
39+
ICosmosDbDistributedLockFactory distributedLockFactory,
40+
RetryExceptionPolicyFactory retryExceptionPolicyFactory)
4241
{
4342
_containerFactory = EnsureArg.IsNotNull(containerFactory, nameof(containerFactory));
4443
_queryFactory = EnsureArg.IsNotNull(queryFactory, nameof(queryFactory));
4544
_distributedLockFactory = EnsureArg.IsNotNull(distributedLockFactory, nameof(distributedLockFactory));
45+
_retryExceptionPolicyFactory = EnsureArg.IsNotNull(retryExceptionPolicyFactory, nameof(retryExceptionPolicyFactory));
46+
47+
_retryPolicy = _retryExceptionPolicyFactory.BackgroundWorkerRetryPolicy;
4648
}
4749

4850
public bool IsInitialized() => true;

0 commit comments

Comments
 (0)