
Commit 3aaeb58

See if Redesign of InputStream Converter Helps Memory Usage (Azure#32444)
1 parent eccb2c2 commit 3aaeb58

2 files changed (+99 −75 lines)

sdk/storage/azure-storage-blob/src/main/java/com/azure/storage/blob/BlobAsyncClient.java

Lines changed: 36 additions & 32 deletions
@@ -650,27 +650,34 @@ public Mono<Response<BlockBlobItem>> uploadWithResponse(Flux<ByteBuffer> data,
     @ServiceMethod(returns = ReturnType.SINGLE)
     public Mono<Response<BlockBlobItem>> uploadWithResponse(BlobParallelUploadOptions options) {
         /*
-        The following is catalogue of all the places we allocate memory/copy in any upload method a justification for
-        that case current as of 1/13/21.
-        - Async buffered upload chunked upload: We used an UploadBufferPool. This will allocate memory as needed up to
-        the configured maximum. This is necessary to support replayability on retires. Each flux to come out of the pool
-        is a Flux.just() of up to two deep copied buffers, so it is replayable. It also allows us to optimize the upload
-        by uploading the maximum amount per block. Finally, in the case of chunked uploading, it allows the customer to
-        pass data without knowing the size. Note that full upload does not need a deep copy because the Flux emitted by
-        the PayloadSizeGate in the full upload case is already replayable and the length is maintained by the gate.
-        - Sync buffered upload: converting the input stream to a flux involves creating a buffer for each stream read.
-        Using a new buffer per read ensures that the reads are safe and not overwriting data in buffers that were passed
-        to the async upload but have not yet been sent. This covers both full and chunked uploads in the sync case.
-        - BlobOutputStream: A deep copy is made of any buffer passed to write. While async copy does streamline our code
-        and allow for some potential parallelization, this extra copy is necessary to ensure that customers writing to
-        the stream in a tight loop are not overwriting data previously given to the stream before it has been sent.
-
-        Taken together, these should support retries and protect against data being overwritten in all upload scenarios.
-
-        One note is that there is no deep copy in the uploadFull method. This is unnecessary as explained in
-        uploadFullOrChunked because the Flux coming out of the size gate in that case is already replayable and reusing
-        buffers is not a common scenario for async like it is in sync (and we already buffer in sync to convert from a
-        stream).
+         * The following is catalogue of all the places we allocate memory/copy in any upload method a justification for
+         * that case current as of 1/13/21.
+         *
+         * - Async buffered upload chunked upload: We used an UploadBufferPool. This will allocate memory as needed up
+         * to the configured maximum. This is necessary to support replayability on retires. Each flux to come out of
+         * the pool is a Flux.just() of up to two deep copied buffers, so it is replayable. It also allows us to
+         * optimize the upload by uploading the maximum amount per block. Finally, in the case of chunked uploading,
+         * it allows the customer to pass data without knowing the size. Note that full upload does not need a deep
+         * copy because the Flux emitted by the PayloadSizeGate in the full upload case is already replayable and the
+         * length is maintained by the gate.
+         *
+         * - Sync buffered upload: converting the input stream to a flux involves creating a buffer for each stream
+         * read. Using a new buffer per read ensures that the reads are safe and not overwriting data in buffers that
+         * were passed to the async upload but have not yet been sent. This covers both full and chunked uploads in
+         * the sync case.
+         *
+         * - BlobOutputStream: A deep copy is made of any buffer passed to write. While async copy does streamline our
+         * code and allow for some potential parallelization, this extra copy is necessary to ensure that customers
+         * writing to the stream in a tight loop are not overwriting data previously given to the stream before it has
+         * been sent.
+         *
+         * Taken together, these should support retries and protect against data being overwritten in all upload
+         * scenarios.
+         *
+         * One note is that there is no deep copy in the uploadFull method. This is unnecessary as explained in
+         * uploadFullOrChunked because the Flux coming out of the size gate in that case is already replayable and
+         * reusing buffers is not a common scenario for async like it is in sync (and we already buffer in sync to
+         * convert from a stream).
          */
         try {
             StorageImplUtils.assertNotNull("options", options);
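The comment block above hinges on deep copies making each emitted Flux safely replayable across retries. As a rough illustration of that reasoning (a standalone demo, not SDK code; the class and variable names are invented), the sketch below shows that a Flux built from a deep-copied buffer keeps returning the same bytes on every re-subscription, even if the caller immediately reuses its own array:

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import reactor.core.publisher.Flux;

// Hypothetical demo class, not part of the SDK.
public final class ReplayableBufferDemo {
    public static void main(String[] args) {
        byte[] callerOwned = "hello".getBytes(StandardCharsets.UTF_8);

        // Deep copy the caller's bytes so the Flux owns its own data.
        ByteBuffer copy = ByteBuffer.allocate(callerOwned.length);
        copy.put(callerOwned);
        copy.flip();
        Flux<ByteBuffer> replayable = Flux.just(copy.asReadOnlyBuffer());

        // The caller reuses its array, as a tight write loop might.
        callerOwned[0] = (byte) 'X';

        // Each subscription (first attempt, then a retry) still sees "hello".
        replayable.subscribe(b -> System.out.println(StandardCharsets.UTF_8.decode(b.duplicate())));
        replayable.subscribe(b -> System.out.println(StandardCharsets.UTF_8.decode(b.duplicate())));
    }
}

Without the copy, a retry would re-read whatever the caller had since written into the shared array.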
@@ -728,10 +735,9 @@ private Mono<Response<BlockBlobItem>> uploadFullBlob(BlockBlobAsyncClient blockB
         Boolean legalHold) {

         /*
-        Note that there is no need to buffer here as the flux returned by the size gate in this case is created
-        from an iterable and is therefore replayable.
+         * Note that there is no need to buffer here as the flux returned by the size gate in this case is created
+         * from an iterable and is therefore replayable.
          */
-
         return UploadUtils.computeMd5(data, computeMd5, LOGGER)
             .map(fluxMd5Wrapper -> new BlockBlobSimpleUploadOptions(fluxMd5Wrapper.getData(), length)
                 .setHeaders(headers)
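As a quick aside on the note above (again an invented demo, not SDK code): a Flux created from an iterable re-emits its elements on every subscription, which is why the full-upload path can be retried without an extra buffering step.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import reactor.core.publisher.Flux;

// Illustration: an iterable-backed Flux replays its elements for each subscriber.
public final class IterableReplayDemo {
    public static void main(String[] args) {
        List<byte[]> chunks = List.of(
            "part-1".getBytes(StandardCharsets.UTF_8),
            "part-2".getBytes(StandardCharsets.UTF_8));

        Flux<ByteBuffer> fromIterable = Flux.fromIterable(chunks).map(ByteBuffer::wrap);

        // First attempt and a simulated retry both see part-1, part-2.
        fromIterable.subscribe(b -> System.out.println(StandardCharsets.UTF_8.decode(b)));
        fromIterable.subscribe(b -> System.out.println(StandardCharsets.UTF_8.decode(b)));
    }
}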
@@ -775,10 +781,10 @@ private Mono<Response<BlockBlobItem>> uploadInChunks(BlockBlobAsyncClient blockB
             ModelHelper.wrapBlobOptions(parallelTransferOptions));

         /*
-        Write to the pool and upload the output.
-        maxConcurrency = 1 when writing means only 1 BufferAggregator will be accumulating at a time.
-        parallelTransferOptions.getMaxConcurrency() appends will be happening at once, so we guarantee buffering of
-        only concurrency + 1 chunks at a time.
+         * Write to the pool and upload the output.
+         * maxConcurrency = 1 when writing means only 1 BufferAggregator will be accumulating at a time.
+         * parallelTransferOptions.getMaxConcurrency() appends will be happening at once, so we guarantee buffering of
+         * only concurrency + 1 chunks at a time.
          */
         return chunkedSource.flatMapSequential(stagingArea::write, 1, 1)
            .concatWith(Flux.defer(stagingArea::flush))
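The buffering bound described in that comment comes from the concurrency argument of flatMapSequential, which caps how many inner publishers are subscribed at once. A self-contained sketch of that behaviour (the chunk contents and names here are invented, not the SDK's):

import java.time.Duration;
import java.util.concurrent.atomic.AtomicInteger;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

// Illustration: with maxConcurrency = 1 only one inner Mono runs at a time,
// mirroring the single accumulating BufferAggregator described above.
public final class FlatMapSequentialDemo {
    public static void main(String[] args) {
        AtomicInteger inFlight = new AtomicInteger();

        Flux.range(0, 5)
            .flatMapSequential(i ->
                Mono.fromCallable(() -> {
                    System.out.println("start chunk " + i + ", in flight = " + inFlight.incrementAndGet());
                    return i;
                })
                    .delayElement(Duration.ofMillis(50))
                    .doOnNext(x -> inFlight.decrementAndGet()),
                1, 1) // maxConcurrency = 1, prefetch = 1
            .blockLast();
    }
}

With maxConcurrency = 1 the printed in-flight count never exceeds one; the real pipeline then bounds total buffering at roughly concurrency + 1 chunks, as the comment states.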
@@ -799,10 +805,8 @@ private Mono<Response<BlockBlobItem>> uploadInChunks(BlockBlobAsyncClient blockB
                     }
                     return responseMono;
                 })
-                // We only care about the stageBlock insofar as it was successful,
-                // but we need to collect the ids.
-                .map(x -> blockId)
-                .flux();
+                // We only care about the stageBlock insofar as it was successful, but we need to collect the ids.
+                .map(x -> blockId);
             }, parallelTransferOptions.getMaxConcurrency(), 1)
             .collect(Collectors.toList())
             .flatMap(ids ->
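For orientation, the surrounding pipeline follows a stage-then-commit shape: stage each chunk, keep its block id, and commit the collected ids in order. A rough sketch of that shape with hypothetical stand-ins (stageBlock and commitBlockList below are placeholders, not the real BlockBlobAsyncClient calls):

import java.util.List;
import java.util.stream.Collectors;
import reactor.core.publisher.Flux;
import reactor.core.publisher.Mono;

// Sketch of the stage-then-commit flow; the "service calls" are simulated locally.
public final class StageThenCommitSketch {
    static Mono<Void> stageBlock(String blockId, String chunk) {
        return Mono.fromRunnable(() -> System.out.println("staged " + blockId));
    }

    static Mono<String> commitBlockList(List<String> blockIds) {
        return Mono.just("committed " + blockIds);
    }

    public static void main(String[] args) {
        Flux.just("chunk-0", "chunk-1", "chunk-2")
            .index()
            .flatMapSequential(tuple -> {
                String blockId = "block-" + tuple.getT1();
                // We only care that staging succeeded, but we must keep the id for the commit.
                return stageBlock(blockId, tuple.getT2()).thenReturn(blockId);
            }, 2, 1)
            .collect(Collectors.toList())
            .flatMap(StageThenCommitSketch::commitBlockList)
            .subscribe(System.out::println);
    }
}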

sdk/storage/azure-storage-common/src/main/java/com/azure/storage/common/Utility.java

Lines changed: 63 additions & 43 deletions
@@ -5,14 +5,15 @@

 import com.azure.core.exception.UnexpectedLengthException;
 import com.azure.core.util.CoreUtils;
+import com.azure.core.util.FluxUtil;
 import com.azure.core.util.UrlBuilder;
 import com.azure.core.util.logging.ClientLogger;
 import com.azure.storage.common.implementation.StorageImplUtils;
 import reactor.core.publisher.Flux;
-import reactor.core.publisher.Mono;

 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UncheckedIOException;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
@@ -188,7 +189,7 @@ public static Flux<ByteBuffer> convertStreamToByteBuffer(InputStream data, long
     /**
      * A utility method for converting the input stream to Flux of ByteBuffer. Will check the equality of entity length
      * and the input length.
-     *
+     * <p>
      * Using markAndReset=true to force a seekable stream implies a buffering strategy is not being used, in which case
      * length is still needed for whatever underlying REST call is being streamed to. If markAndReset=false and data is
      * being buffered, consider using {@link com.azure.core.util.FluxUtil#toFluxByteBuffer(InputStream, int)} which
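For reference, a minimal usage sketch of this helper, based only on the signature and javadoc shown in this diff (the payload, block size, and class name are illustrative):

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import com.azure.storage.common.Utility;
import reactor.core.publisher.Flux;

// Illustration only: feed a mark-supporting stream of known length through the converter.
public final class ConvertStreamSketch {
    public static void main(String[] args) {
        byte[] bytes = "some payload".getBytes(StandardCharsets.UTF_8);
        InputStream data = new ByteArrayInputStream(bytes); // supports mark/reset

        // markAndReset = true: the stream is marked up front and reset on every (re)subscription,
        // so the same Flux can back a retried upload.
        Flux<ByteBuffer> buffers = Utility.convertStreamToByteBuffer(data, bytes.length, 4, true);

        buffers.subscribe(b -> System.out.println("emitted " + b.remaining() + " bytes"));
    }
}

If markAndReset=false and the data is buffered elsewhere, the javadoc above points at FluxUtil.toFluxByteBuffer(InputStream, int) as an alternative.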
@@ -205,78 +206,97 @@ public static Flux<ByteBuffer> convertStreamToByteBuffer(InputStream data, long
      * @throws RuntimeException When I/O error occurs.
      */
     public static Flux<ByteBuffer> convertStreamToByteBuffer(InputStream data, long length, int blockSize,
-        boolean markAndReset) {
+            boolean markAndReset) {
         if (markAndReset) {
             data.mark(Integer.MAX_VALUE);
         }
+
         if (length == 0) {
             try {
                 if (data.read() != -1) {
                     long totalLength = 1 + data.available();
-                    throw LOGGER.logExceptionAsError(new UnexpectedLengthException(
-                        String.format("Request body emitted %d bytes, more than the expected %d bytes.",
-                            totalLength, length), totalLength, length));
+                    return FluxUtil.fluxError(LOGGER, new UnexpectedLengthException(String.format(
+                        "Request body emitted %d bytes, more than the expected %d bytes.", totalLength, length),
+                        totalLength, length));
                 }
             } catch (IOException e) {
-                throw LOGGER.logExceptionAsError(new RuntimeException("I/O errors occurred", e));
+                return FluxUtil.fluxError(LOGGER, new UncheckedIOException(e));
             }
         }
+
         return Flux.defer(() -> {
             /*
-            If the request needs to be retried, the flux will be resubscribed to. The stream and counter must be
-            reset in order to correctly return the same data again.
+             * If the request needs to be retried, the flux will be resubscribed to. The stream and counter must be
+             * reset in order to correctly return the same data again.
              */
-            final long[] currentTotalLength = new long[1];
             if (markAndReset) {
                 try {
                     data.reset();
                 } catch (IOException e) {
-                    throw LOGGER.logExceptionAsError(new RuntimeException(e));
+                    return FluxUtil.fluxError(LOGGER, new UncheckedIOException(e));
                 }
             }
-            return Flux.range(0, (int) Math.ceil((double) length / (double) blockSize))
-                .map(i -> i * blockSize)
-                .concatMap(pos -> Mono.fromCallable(() -> {
-                    long count = pos + blockSize > length ? length - pos : blockSize;
-                    byte[] cache = new byte[(int) count];
-                    int numOfBytes = 0;
-                    int offset = 0;
-                    // Revise the casting if the max allowed network data transmission is over 2G.
-                    int len = (int) count;
-                    while (numOfBytes != -1 && offset < count) {
+
+            final long[] currentTotalLength = new long[1];
+            return Flux.generate(() -> data, (is, sink) -> {
+                long pos = currentTotalLength[0];
+
+                long count = (pos + blockSize) > length ? (length - pos) : blockSize;
+                byte[] cache = new byte[(int) count];
+
+                int numOfBytes = 0;
+                int offset = 0;
+                // Revise the casting if the max allowed network data transmission is over 2G.
+                int len = (int) count;
+
+                while (numOfBytes != -1 && offset < count) {
+                    try {
                         numOfBytes = data.read(cache, offset, len);
                         if (numOfBytes != -1) {
                             offset += numOfBytes;
                             len -= numOfBytes;
                             currentTotalLength[0] += numOfBytes;
                         }
+                    } catch (IOException e) {
+                        sink.error(e);
+                        return is;
                     }
-                    if (numOfBytes == -1 && currentTotalLength[0] < length) {
-                        throw LOGGER.logExceptionAsError(new UnexpectedLengthException(
-                            String.format("Request body emitted %d bytes, less than the expected %d bytes.",
-                                currentTotalLength[0], length), currentTotalLength[0], length));
-                    }
+                }

-                    // Validate that stream isn't longer.
-                    if (currentTotalLength[0] >= length) {
-                        try {
-                            if (data.read() != -1) {
-                                long totalLength = 1 + currentTotalLength[0] + data.available();
-                                throw LOGGER.logExceptionAsError(new UnexpectedLengthException(
-                                    String.format("Request body emitted %d bytes, more than the expected %d bytes.",
-                                        totalLength, length), totalLength, length));
-                            } else if (currentTotalLength[0] > length) {
-                                throw LOGGER.logExceptionAsError(new IllegalStateException(
-                                    String.format("Read more data than was requested. Size of data read: %d. Size of data"
-                                        + " requested: %d", currentTotalLength[0], length)));
-                            }
-                        } catch (IOException e) {
-                            throw LOGGER.logExceptionAsError(new RuntimeException("I/O errors occurred", e));
+                if (numOfBytes == -1 && currentTotalLength[0] < length) {
+                    sink.error(LOGGER.logExceptionAsError(new UnexpectedLengthException(String.format(
+                        "Request body emitted %d bytes, less than the expected %d bytes.",
+                        currentTotalLength[0], length), currentTotalLength[0], length)));
+                    return is;
+                }
+
+                // Validate that stream isn't longer.
+                if (currentTotalLength[0] >= length) {
+                    try {
+                        if (data.read() != -1) {
+                            long totalLength = 1 + currentTotalLength[0] + data.available();
+                            sink.error(LOGGER.logExceptionAsError(new UnexpectedLengthException(
+                                String.format("Request body emitted %d bytes, more than the expected %d bytes.",
+                                    totalLength, length), totalLength, length)));
+                            return is;
+                        } else if (currentTotalLength[0] > length) {
+                            sink.error(LOGGER.logExceptionAsError(new IllegalStateException(
+                                String.format("Read more data than was requested. Size of data read: %d. Size of data"
+                                    + " requested: %d", currentTotalLength[0], length))));
+                            return is;
                         }
+                    } catch (IOException e) {
+                        sink.error(LOGGER.logExceptionAsError(new RuntimeException("I/O errors occurred", e)));
+                        return is;
                     }
+                }

-                    return ByteBuffer.wrap(cache, 0, offset);
-                }));
+                sink.next(ByteBuffer.wrap(cache, 0, offset));
+                if (currentTotalLength[0] == length) {
+                    sink.complete();
+                }
+                return is;
+            });
         });
     }
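The reworked converter above replaces the Flux.range → map → concatMap(Mono.fromCallable(...)) pipeline with a single Flux.generate loop, so each emission allocates exactly one block-sized array and no per-block Mono or range bookkeeping, which is the memory angle the commit title is probing. A stripped-down sketch of the same pattern outside the SDK (the method name and the simplified error handling are mine; the real code above also validates the byte count against the expected length and handles mark/reset):

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import reactor.core.publisher.Flux;

// Simplified sketch of a Flux.generate-based InputStream-to-ByteBuffer conversion.
public final class GenerateConverterSketch {
    static Flux<ByteBuffer> streamToBuffers(InputStream data, long length, int blockSize) {
        long[] totalRead = new long[1];
        return Flux.generate(() -> data, (is, sink) -> {
            long remaining = length - totalRead[0];
            int count = (int) Math.min(blockSize, remaining);
            byte[] cache = new byte[count];
            int offset = 0;
            try {
                // Fill the block, tolerating short reads from the stream.
                while (offset < count) {
                    int read = is.read(cache, offset, count - offset);
                    if (read == -1) {
                        break;
                    }
                    offset += read;
                }
            } catch (IOException e) {
                sink.error(e);
                return is;
            }
            totalRead[0] += offset;
            sink.next(ByteBuffer.wrap(cache, 0, offset));
            if (offset < count || totalRead[0] == length) {
                sink.complete();
            }
            return is;
        });
    }
}

Flux.generate invokes the generator once per requested element, so blocks are produced on demand rather than scheduled up front, and a retried request simply re-enters the surrounding Flux.defer in the real code and restarts the loop from a reset stream.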
