Skip to content

Commit b2c7678

Browse files
authored
proactiveInit wiring in diagnostics (Azure#36111)
* Preliminary code for tracking last unhealthy timestamp only when replica first becomes unhealthy. * Fix tests. * Fix tests. * Fix tests. * Preliminary code changes. * Preliminary code changes. * Preliminary code changes. * Updated CHANGELOG.md. * Added tests. * Fixed tests. * Fixed tests. * Fixed tests. * Fixed tests.
1 parent 6caa4a4 commit b2c7678

File tree

11 files changed

+134
-29
lines changed

11 files changed

+134
-29
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/ClientTelemetryTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ public void clientTelemetryWithStageJunoEndpoint(boolean useProxy) throws Interr
267267
IllegalAccessException {
268268
CosmosClient cosmosClient = null;
269269
String databaseId = UUID.randomUUID().toString();
270+
270271
try {
271272
String whiteListedAccountForTelemetry = System.getProperty("COSMOS.CLIENT_TELEMETRY_COSMOS_ACCOUNT");
272273
assertThat(whiteListedAccountForTelemetry).isNotNull();

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/ClientConfigDiagnosticsTest.java

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,84 @@
44
package com.azure.cosmos.implementation;
55

66
import com.azure.cosmos.ConnectionMode;
7+
import com.azure.cosmos.CosmosContainerProactiveInitConfig;
8+
import com.azure.cosmos.CosmosContainerProactiveInitConfigBuilder;
79
import com.azure.cosmos.implementation.directconnectivity.RntbdTransportClient;
810
import com.azure.cosmos.implementation.guava25.collect.ImmutableList;
911
import com.azure.cosmos.implementation.http.HttpClientConfig;
12+
import com.azure.cosmos.models.CosmosContainerIdentity;
1013
import com.fasterxml.jackson.core.JsonFactory;
1114
import com.fasterxml.jackson.core.JsonGenerator;
1215
import com.fasterxml.jackson.databind.ObjectMapper;
1316
import com.fasterxml.jackson.databind.SerializerProvider;
1417
import com.fasterxml.jackson.databind.node.ObjectNode;
1518
import org.mockito.Mockito;
19+
import org.testng.annotations.DataProvider;
1620
import org.testng.annotations.Test;
1721

1822
import java.io.StringWriter;
1923
import java.time.Duration;
24+
import java.util.Arrays;
2025
import java.util.HashMap;
26+
import java.util.List;
2127
import java.util.UUID;
2228
import java.util.concurrent.atomic.AtomicInteger;
29+
import java.util.stream.Collectors;
2330

2431
import static org.assertj.core.api.Assertions.assertThat;
2532

2633
public class ClientConfigDiagnosticsTest {
2734
private final ObjectMapper objectMapper = new ObjectMapper();
35+
private static final ImplementationBridgeHelpers.CosmosContainerIdentityHelper.CosmosContainerIdentityAccessor containerIdentityAccessor = ImplementationBridgeHelpers
36+
.CosmosContainerIdentityHelper
37+
.getCosmosContainerIdentityAccessor();
38+
39+
@DataProvider(name = "proactiveContainerInitConfigProvider")
40+
public Object[][] proactiveContainerInitConfigProvider() {
41+
42+
Duration aggressiveWarmUpDuration1 = Duration.ofSeconds(1);
43+
int proactiveConnectionRegionCount1 = 1;
44+
45+
Duration aggressiveWarmUpDuration2 = Duration.ofMillis(1000);
46+
int proactiveConnectionRegionCount2 = 1;
47+
48+
Duration aggressiveWarmUpDuration3 = null;
49+
int proactiveConnectionRegionCount3 = 2;
50+
51+
List<CosmosContainerIdentity> cosmosContainerIdentities = Arrays.asList(
52+
new CosmosContainerIdentity("test-db", "test-container-1"),
53+
new CosmosContainerIdentity("test-db", "test-container-2")
54+
);
55+
56+
return new Object[][] {
57+
{
58+
new CosmosContainerProactiveInitConfigBuilder(cosmosContainerIdentities)
59+
.setAggressiveWarmupDuration(aggressiveWarmUpDuration1)
60+
.setProactiveConnectionRegionsCount(proactiveConnectionRegionCount1)
61+
.build(),
62+
aggressiveWarmUpDuration1,
63+
proactiveConnectionRegionCount1,
64+
cosmosContainerIdentities
65+
},
66+
{
67+
new CosmosContainerProactiveInitConfigBuilder(cosmosContainerIdentities)
68+
.setAggressiveWarmupDuration(aggressiveWarmUpDuration2)
69+
.setProactiveConnectionRegionsCount(proactiveConnectionRegionCount2)
70+
.build(),
71+
aggressiveWarmUpDuration2,
72+
proactiveConnectionRegionCount2,
73+
cosmosContainerIdentities
74+
},
75+
{
76+
new CosmosContainerProactiveInitConfigBuilder(cosmosContainerIdentities)
77+
.setProactiveConnectionRegionsCount(proactiveConnectionRegionCount3)
78+
.build(),
79+
null,
80+
proactiveConnectionRegionCount3,
81+
cosmosContainerIdentities
82+
}
83+
};
84+
}
2885

2986
@Test(groups = { "unit" })
3087
public void bareMinimum() throws Exception {
@@ -124,8 +181,13 @@ public void gw() throws Exception {
124181
assertThat(objectNode.get("connCfg").get("other").asText()).isEqualTo("(ed: false, cs: false, rv: true)");
125182
}
126183

127-
@Test(groups = { "unit" })
128-
public void full() throws Exception {
184+
@Test(groups = { "unit" }, dataProvider = "proactiveContainerInitConfigProvider")
185+
public void full(
186+
CosmosContainerProactiveInitConfig containerProactiveInitConfig,
187+
Duration aggressiveWarmupDuration,
188+
int proactiveConnectionRegionCount,
189+
List<CosmosContainerIdentity> cosmosContainerIdentities) throws Exception {
190+
129191
DiagnosticsClientContext clientContext = Mockito.mock(DiagnosticsClientContext.class);
130192
System.setProperty("COSMOS.REPLICA_ADDRESS_VALIDATION_ENABLED", "false");
131193

@@ -144,6 +206,7 @@ public void full() throws Exception {
144206
diagnosticsClientConfig.withConnectionSharingAcrossClientsEnabled(true);
145207
diagnosticsClientConfig.withEndpointDiscoveryEnabled(true);
146208
diagnosticsClientConfig.withClientMap(new HashMap<>());
209+
diagnosticsClientConfig.withProactiveContainerInitConfig(containerProactiveInitConfig);
147210

148211
Mockito.doReturn(diagnosticsClientConfig).when(clientContext).getConfig();
149212

@@ -162,6 +225,27 @@ public void full() throws Exception {
162225
assertThat(objectNode.get("connCfg").get("gw").asText()).isEqualTo("(cps:500, nrto:PT18S, icto:PT17S, p:false)");
163226
assertThat(objectNode.get("connCfg").get("other").asText()).isEqualTo("(ed: true, cs: true, rv: false)");
164227

228+
String expectedProactiveInitConfigString = reconstructProactiveInitConfigString(cosmosContainerIdentities, aggressiveWarmupDuration, proactiveConnectionRegionCount);
229+
230+
assertThat(objectNode.get("proactiveInit").asText()).isEqualTo(expectedProactiveInitConfigString);
231+
165232
System.clearProperty("COSMOS.REPLICA_ADDRESS_VALIDATION_ENABLED");
166233
}
234+
235+
private static String reconstructProactiveInitConfigString(
236+
List<CosmosContainerIdentity> containerIdentities,
237+
Duration aggressiveWarmupDuration,
238+
int proactiveConnectionRegionCount) {
239+
240+
return String.format(
241+
"(containers:%s)(pcrc:%d)(awd:%s)",
242+
containerIdentities
243+
.stream()
244+
.map(ci -> String.join(
245+
".",
246+
containerIdentityAccessor.getContainerLink(ci)))
247+
.collect(Collectors.joining(";")),
248+
proactiveConnectionRegionCount,
249+
aggressiveWarmupDuration);
250+
}
167251
}

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/RxDocumentClientImplTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.azure.core.http.ProxyOptions;
77
import com.azure.cosmos.BridgeInternal;
88
import com.azure.cosmos.ConsistencyLevel;
9+
import com.azure.cosmos.CosmosContainerProactiveInitConfig;
910
import com.azure.cosmos.CosmosDiagnostics;
1011
import com.azure.cosmos.CosmosEndToEndOperationLatencyPolicyConfig;
1112
import com.azure.cosmos.SessionRetryOptions;
@@ -79,6 +80,7 @@ public class RxDocumentClientImplTest {
7980
private IRetryPolicyFactory resetSessionTokenRetryPolicyMock;
8081
private CosmosEndToEndOperationLatencyPolicyConfig endToEndOperationLatencyPolicyConfig;
8182
private SessionRetryOptions sessionRetryOptionsMock;
83+
private CosmosContainerProactiveInitConfig containerProactiveInitConfigMock;
8284

8385
@BeforeClass(groups = "unit")
8486
public void setUp() {
@@ -100,6 +102,7 @@ public void setUp() {
100102
this.resetSessionTokenRetryPolicyMock = Mockito.mock(IRetryPolicyFactory.class);
101103
this.endToEndOperationLatencyPolicyConfig = Mockito.mock(CosmosEndToEndOperationLatencyPolicyConfig.class);
102104
this.sessionRetryOptionsMock = Mockito.mock(SessionRetryOptions.class);
105+
this.containerProactiveInitConfigMock = Mockito.mock(CosmosContainerProactiveInitConfig.class);
103106
}
104107

105108
@Test(groups = {"unit"})
@@ -220,7 +223,8 @@ public void readMany() {
220223
this.cosmosClientTelemetryConfigMock,
221224
this.clientCorrelationIdMock,
222225
this.endToEndOperationLatencyPolicyConfig,
223-
this.sessionRetryOptionsMock);
226+
this.sessionRetryOptionsMock,
227+
this.containerProactiveInitConfigMock);
224228

225229
ReflectionUtils.setCollectionCache(rxDocumentClient, this.collectionCacheMock);
226230
ReflectionUtils.setPartitionKeyRangeCache(rxDocumentClient, this.partitionKeyRangeCacheMock);

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/RxDocumentClientUnderTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ public RxDocumentClientUnderTest(URI serviceEndpoint,
5757
clientTelemetryConfig,
5858
null,
5959
null,
60+
null,
6061
null);
6162
init(null, null);
6263
}

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/SpyClientUnderTestFactory.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ public SpyBaseClass(
6363
clientTelemetryConfig,
6464
null,
6565
null,
66+
null,
6667
null);
6768
}
6869

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/BarrierRequestHelperTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,7 @@ public void barrierWithAadAuthorizationTokenProviderType() throws URISyntaxExcep
182182
new CosmosClientTelemetryConfig().sendClientTelemetryToService(false),
183183
null,
184184
null,
185+
null,
185186
null);
186187

187188
ResourceType resourceType = ResourceType.DocumentCollection;

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,19 @@
77

88
#### Breaking Changes
99

10-
#### Bugs Fixed
11-
12-
#### Other Changes
13-
14-
### 4.48.1 (2023-08-09)
15-
> [!IMPORTANT]
16-
> We strongly recommend our customers to use version 4.48.1 and above.
10+
* Gone exceptions that are not idempotent should not be retried because it is not known if they succeeded for sure. The handling of the exception in this case is left to the user. Fixed retrying write operations when a gone exception occurs in bulk mode. - See [PR 35838](https://github.com/Azure/azure-sdk-for-java/pull/35838)
1711
#### Bugs Fixed
1812
* Fixed retrying write operations when a gone exception occurs in bulk mode. - See [PR 35838](https://github.com/Azure/azure-sdk-for-java/pull/35838)
1913
* Fixed request start time in the `CosmosDiagnostics` for individual request responses - See [PR 35705](https://github.com/Azure/azure-sdk-for-java/pull/35705)
2014
* Fixed an issue where `ConnectionStateListener` tracked staled `Uris` which fails to mark the current `Uris` unhealthy properly - See [PR 36067](https://github.com/Azure/azure-sdk-for-java/pull/36067)
2115
* Gone exceptions that are not idempotent should not be retried because it is not known if they succeeded for sure. The handling of the exception in this case is left to the user. Fixed retrying write operations when a gone exception occurs in bulk mode. - See [PR 35838](https://github.com/Azure/azure-sdk-for-java/pull/35838)
22-
* Fixed an issue to update the last unhealthy timestamp for an `Uri` instance only when transitioning to `Unhealthy` from a different health status - See [36083](https://github.com/Azure/azure-sdk-for-java/pull/36083)
23-
* Improved the channel health check flow to deem a channel unhealthy when it sees consecutive cancellations. - See [PR 36225](https://github.com/Azure/azure-sdk-for-java/pull/36225)
24-
* Optimized the replica validation flow to validate replica health with `Unknown` health status only when the replica is
25-
used by a container which is also part of the connection warm-up flow. - See [PR 36225](https://github.com/Azure/azure-sdk-for-java/pull/36225)
16+
* Fixed an issue to update the last unhealthy timestamp for an `Uri` instance only when transitioning to `Unhealthy` from a different health status - See [PR 36083](https://github.com/Azure/azure-sdk-for-java/pull/36083)
17+
* Wired `proactiveInit` into the diagnostics to track warmed up containers, proactive connection regions and aggressive warm up duration - See [PR 36111](https://github.com/Azure/azure-sdk-for-java/pull/36111)
2618

2719
#### Other Changes
2820

2921
### 4.48.0 (2023-07-18)
22+
3023
#### Bugs Fixed
3124
* Fixed an issue with deserialization of `conflictResolutionTimestamp` for All versions and deletes change feed mode. - See [PR 35909](https://github.com/Azure/azure-sdk-for-java/pull/35909)
3225
* Added capability to mark a region as unavailable when a request is cancelled due to end-to-end timeout and connection issues
@@ -188,6 +181,8 @@ there are non-existent document IDs also passed through the API - See [PR 35513]
188181
* Added option to set throughput control group name on per-request level for batch and bulk operations. - See [PR 31362](https://github.com/Azure/azure-sdk-for-java/pull/31362)
189182

190183
### 4.37.1 (2022-10-07)
184+
> [!IMPORTANT]
185+
> We strongly recommend our customers to use version 4.37.1 and above.
191186
#### Bugs Fixed
192187
* Fixed incorrect RU metric reporting in micrometer metrics. - See [PR 31307](https://github.com/Azure/azure-sdk-for-java/pull/31307)
193188
* Enabled failover to preferred locations in the case of single-write/multi-read region enabled account for read in Gateway mode and for metadata requests in Direct mode. - More details about the [Bug: Cosmos DB Client gets stuck in timeout retry loop](https://github.com/Azure/azure-sdk-for-java/issues/31260#issue-1396454421). - See [PR 31314](https://github.com/Azure/azure-sdk-for-java/pull/31314)

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/CosmosAsyncClient.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ public final class CosmosAsyncClient implements Closeable {
167167
.withClientCorrelationId(clientCorrelationId)
168168
.withEndToEndOperationLatencyPolicyConfig(endToEndOperationLatencyPolicyConfig)
169169
.withSessionRetryOptions(sessionRetryOptions)
170+
.withContainerProactiveInitConfig(this.proactiveContainerInitConfig)
170171
.build();
171172

172173
this.accountConsistencyLevel = this.asyncDocumentClient.getDefaultConsistencyLevelOfAccount();

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/CosmosContainerProactiveInitConfig.java

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,17 +96,19 @@ public String toString() {
9696
return "";
9797
}
9898

99-
return
100-
String.format(
101-
"%s(%d)",
99+
// containers -> the containers part of the connection warm up
100+
// pcrc -> the first k regions from the preferredRegions to which connections are warmed up
101+
// awd -> duration within which aggressive connection warm up happens
102+
return String.format(
103+
"(containers:%s)(pcrc:%d)(awd:%s)",
102104
cosmosContainerIdentities
103105
.stream()
104106
.map(ci -> String.join(
105107
".",
106-
containerIdAccessor.getDatabaseName(ci),
107-
containerIdAccessor.getContainerName(ci)))
108-
.collect(Collectors.joining(",")),
109-
numProactiveConnectionRegions);
108+
containerIdAccessor.getContainerLink(ci)))
109+
.collect(Collectors.joining(";")),
110+
numProactiveConnectionRegions,
111+
aggressiveWarmupDuration);
110112
}
111113

112114
static void initialize() {

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/AsyncDocumentClient.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ class Builder {
103103
private String clientCorrelationId = null;
104104
private CosmosEndToEndOperationLatencyPolicyConfig cosmosEndToEndOperationLatencyPolicyConfig;
105105
private SessionRetryOptions sessionRetryOptions;
106+
private CosmosContainerProactiveInitConfig containerProactiveInitConfig;
106107

107108
public Builder withServiceEndpoint(String serviceEndpoint) {
108109
try {
@@ -249,6 +250,11 @@ public Builder withSessionRetryOptions(SessionRetryOptions sessionRetryOptions)
249250
return this;
250251
}
251252

253+
public Builder withContainerProactiveInitConfig(CosmosContainerProactiveInitConfig containerProactiveInitConfig) {
254+
this.containerProactiveInitConfig = containerProactiveInitConfig;
255+
return this;
256+
}
257+
252258
private void ifThrowIllegalArgException(boolean value, String error) {
253259
if (value) {
254260
throw new IllegalArgumentException(error);
@@ -283,7 +289,8 @@ public AsyncDocumentClient build() {
283289
clientTelemetryConfig,
284290
clientCorrelationId,
285291
cosmosEndToEndOperationLatencyPolicyConfig,
286-
sessionRetryOptions);
292+
sessionRetryOptions,
293+
containerProactiveInitConfig);
287294

288295
client.init(state, null);
289296
return client;

0 commit comments

Comments
 (0)