Skip to content

Commit ce29be9

Browse files
authored
Retry Requests Based on Causal Exception Too (Azure#32648)
Retry Requests Based on Causal Exception Too
1 parent fbe320d commit ce29be9

File tree

2 files changed

+107
-28
lines changed

2 files changed

+107
-28
lines changed

sdk/storage/azure-storage-common/src/main/java/com/azure/storage/common/policy/RequestRetryPolicy.java

Lines changed: 60 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ private Mono<HttpResponse> attemptAsync(final HttpPipelineCallContext context, H
110110
Flux<ByteBuffer> bufferedBody = context.getHttpRequest().getBody().map(ByteBuffer::duplicate);
111111
context.getHttpRequest().setBody(bufferedBody);
112112
}
113+
113114
if (!tryingPrimary) {
114115
UrlBuilder builder = UrlBuilder.parse(context.getHttpRequest().getUrl());
115116
builder.setHost(this.requestRetryOptions.getSecondaryHost());
@@ -119,23 +120,18 @@ private Mono<HttpResponse> attemptAsync(final HttpPipelineCallContext context, H
119120
return Mono.error(e);
120121
}
121122
}
122-
/*
123-
Update the RETRY_COUNT_CONTEXT to log retries.
124-
*/
123+
124+
// Update the RETRY_COUNT_CONTEXT to log retries.
125125
context.setData(HttpLoggingPolicy.RETRY_COUNT_CONTEXT, attempt);
126126

127-
/*
128-
Reset progress if progress is tracked.
129-
*/
127+
// Reset progress if progress is tracked.
130128
ProgressReporter progressReporter = Contexts.with(context.getContext()).getHttpRequestProgressReporter();
131129
if (progressReporter != null) {
132130
progressReporter.reset();
133131
}
134132

135-
/*
136-
We want to send the request with a given timeout, but we don't want to kickoff that timeout-bound operation
137-
until after the retry backoff delay, so we call delaySubscription.
138-
*/
133+
// We want to send the request with a given timeout, but we don't want to kick off that timeout-bound operation
134+
// until after the retry backoff delay, so we call delaySubscription.
139135
Mono<HttpResponse> responseMono = next.clone().process();
140136

141137
// Default try timeout is Integer.MAX_VALUE seconds, if it's that don't set a timeout as that's about 68 years
@@ -152,19 +148,11 @@ private Mono<HttpResponse> attemptAsync(final HttpPipelineCallContext context, H
152148

153149
return responseMono.flatMap(response -> {
154150
boolean newConsiderSecondary = considerSecondary;
155-
boolean retry = false;
156151
int statusCode = response.getStatusCode();
157152

158-
/*
159-
* If attempt was against the secondary & it returned a StatusNotFound (404), then the resource was not
160-
* found. This may be due to replication delay. So, in this case, we'll never try the secondary again for
161-
* this operation.
162-
*/
153+
boolean retry = shouldStatusCodeBeRetried(statusCode, tryingPrimary);
163154
if (!tryingPrimary && statusCode == 404) {
164155
newConsiderSecondary = false;
165-
retry = true;
166-
} else if (statusCode == 503 || statusCode == 500) {
167-
retry = true;
168156
}
169157

170158
if (retry && attempt < requestRetryOptions.getMaxTries()) {
@@ -208,15 +196,10 @@ private Mono<HttpResponse> attemptAsync(final HttpPipelineCallContext context, H
208196
* better to optimistically retry instead of failing too soon. A Timeout Exception is a client-side timeout
209197
* coming from Rx.
210198
*/
211-
boolean retry = false;
212-
Throwable unwrappedThrowable = Exceptions.unwrap(throwable);
213-
if (unwrappedThrowable instanceof IOException) {
214-
retry = true;
215-
} else if (unwrappedThrowable instanceof TimeoutException) {
216-
retry = true;
217-
}
199+
ExceptionRetryStatus exceptionRetryStatus = shouldErrorBeRetried(throwable, attempt,
200+
requestRetryOptions.getMaxTries());
218201

219-
if (retry && attempt < requestRetryOptions.getMaxTries()) {
202+
if (exceptionRetryStatus.canBeRetried) {
220203
/*
221204
* We increment primaryTry if we are about to try the primary again (which is when we consider the
222205
* secondary and tried the secondary this time (tryingPrimary==false) or we do not consider the
@@ -225,14 +208,63 @@ private Mono<HttpResponse> attemptAsync(final HttpPipelineCallContext context, H
225208
*/
226209
int newPrimaryTry = (!tryingPrimary || !considerSecondary) ? primaryTry + 1 : primaryTry;
227210
List<Throwable> suppressedLocal = suppressed == null ? new LinkedList<>() : suppressed;
228-
suppressedLocal.add(unwrappedThrowable);
211+
suppressedLocal.add(exceptionRetryStatus.unwrappedThrowable);
229212
return attemptAsync(context, next, originalRequest, considerSecondary, newPrimaryTry, attempt + 1,
230213
suppressedLocal);
231214
}
215+
232216
if (suppressed != null) {
233217
suppressed.forEach(throwable::addSuppressed);
234218
}
219+
235220
return Mono.error(throwable);
236221
});
237222
}
223+
224+
static ExceptionRetryStatus shouldErrorBeRetried(Throwable error, int attempt, int maxAttempts) {
225+
Throwable unwrappedThrowable = Exceptions.unwrap(error);
226+
227+
// Check if there are any attempts remaining.
228+
if (attempt >= maxAttempts) {
229+
return new ExceptionRetryStatus(false, unwrappedThrowable);
230+
}
231+
232+
// Check if the unwrapped error is an IOException or TimeoutException.
233+
if (unwrappedThrowable instanceof IOException || unwrappedThrowable instanceof TimeoutException) {
234+
return new ExceptionRetryStatus(true, unwrappedThrowable);
235+
}
236+
237+
// Check the causal exception chain for this exception being caused by an IOException or TimeoutException.
238+
Throwable causalException = unwrappedThrowable.getCause();
239+
while (causalException != null) {
240+
if (causalException instanceof IOException || causalException instanceof TimeoutException) {
241+
return new ExceptionRetryStatus(true, unwrappedThrowable);
242+
}
243+
244+
causalException = causalException.getCause();
245+
}
246+
247+
// Finally all exceptions have been checked and none can be retried.
248+
return new ExceptionRetryStatus(false, unwrappedThrowable);
249+
}
250+
251+
static boolean shouldStatusCodeBeRetried(int statusCode, boolean isPrimary) {
252+
/*
253+
* Retry the request if the server had an error (500), was unavailable (503), or requested a backoff (429),
254+
* or if the secondary was being tried and the resources didn't exist there (404). Only the secondary can retry
255+
* if the resource wasn't found as there may be a delay in replication from the primary.
256+
*/
257+
return (statusCode == 429 || statusCode == 500 || statusCode == 503)
258+
|| (!isPrimary && statusCode == 404);
259+
}
260+
261+
static final class ExceptionRetryStatus {
262+
final boolean canBeRetried;
263+
final Throwable unwrappedThrowable;
264+
265+
ExceptionRetryStatus(boolean canBeRetried, Throwable unwrappedThrowable) {
266+
this.canBeRetried = canBeRetried;
267+
this.unwrappedThrowable = unwrappedThrowable;
268+
}
269+
}
238270
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
package com.azure.storage.common.policy
2+
3+
import reactor.core.Exceptions
4+
import spock.lang.Specification
5+
6+
import java.util.concurrent.TimeoutException
7+
8+
/**
9+
* Tests {@link RequestRetryPolicy}
10+
*/
11+
class RequestRetryPolicyTest extends Specification {
12+
def "should exception be retried"() {
13+
expect:
14+
RequestRetryPolicy.shouldErrorBeRetried(error, 0, 1).canBeRetried == shouldBeRetried
15+
16+
where:
17+
error | shouldBeRetried
18+
new IOException() | true
19+
new TimeoutException() | true
20+
new RuntimeException() | false
21+
Exceptions.propagate(new IOException()) | true
22+
Exceptions.propagate(new TimeoutException()) | true
23+
Exceptions.propagate(new RuntimeException()) | false
24+
new Exception(new IOException()) | true
25+
new Exception(new TimeoutException()) | true
26+
new Exception(new RuntimeException()) | false
27+
Exceptions.propagate(new Exception(new IOException())) | true
28+
Exceptions.propagate(new Exception(new TimeoutException())) | true
29+
Exceptions.propagate(new Exception(new RuntimeException())) | false
30+
}
31+
32+
def "should status code be retried"() {
33+
expect:
34+
RequestRetryPolicy.shouldStatusCodeBeRetried(statusCode, isPrimary) == shouldBeRetried
35+
36+
where:
37+
statusCode | isPrimary | shouldBeRetried
38+
429 | true | true
39+
429 | false | true
40+
500 | true | true
41+
500 | false | true
42+
503 | true | true
43+
503 | false | true
44+
404 | true | false
45+
404 | false | true
46+
}
47+
}

0 commit comments

Comments
 (0)