
Commit d85b5c1

fix: ensure mp backend ends nicely and improve throughput
1 parent 5b1b602 commit d85b5c1

File tree

6 files changed: +981 -761 lines changed


changelog.md

Lines changed: 5 additions & 1 deletion
@@ -23,11 +23,15 @@
 
 - Sort files before iterating over a standoff or json folder to ensure reproducibility
 - Sentence detection now correctly match capitalized letters + apostrophe
+- We now ensure that the workers pool is properly closed whatever happens (exception, garbage collection, data ending) in the `multiprocessing` backend. This prevents some executions from hanging indefinitely at the end of the processing.
 
 ### Data API changes
 
 - `LazyCollection` objects are now called `Stream` objects
-- By default, `multiprocessing` backend now preserves the order of the input data
+- By default, `multiprocessing` backend now preserves the order of the input data. To disable this and improve performance, use `deterministic=False` in the `set_processing` method
+- :rocket: Parallelized GPU inference throughput improvements !
+    - For simple {pre-process → model → post-process} pipelines, GPU inference can be up to 30% faster in non-deterministic mode (results can be out of order) and up to 20% faster in deterministic mode (results are in order)
+    - For multitask pipelines, GPU inference can be up to twice as fast (measured in a two-tasks BERT+NER+Qualif pipeline on T4 and A100 GPUs)
 - The `.map_batches`, `.map_pipeline` and `.map_gpu` methods now support a specific `batch_size` and batching function, instead of having a single batch size for all pipes
 - Readers now have a `loop` parameter to cycle over the data indefinitely (useful for training)
 - Readers now have a `shuffle` parameter to shuffle the data before iterating over it
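
A minimal usage sketch of the new `deterministic` flag, assuming `nlp.pipe` returns a `Stream` as in the edsnlp docs; the pipe choice, texts and worker count below are made up for illustration:

    import edsnlp

    nlp = edsnlp.blank("eds")
    nlp.add_pipe("eds.sentences")  # illustrative pipe choice

    texts = ["Patient admis pour chute.", "Pas de fracture visible."]

    stream = nlp.pipe(texts)
    stream = stream.set_processing(
        backend="multiprocessing",
        num_cpu_workers=4,
        deterministic=False,  # trade input order for throughput
    )
    docs = list(stream)  # documents may come back out of order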

edsnlp/core/pipeline.py

Lines changed: 0 additions & 2 deletions
@@ -721,8 +721,6 @@ def preprocess_many(self, docs: Iterable[Doc], compress=True, supervision=True):
         """
         res = Stream.ensure_stream(docs)
         res = res.map(functools.partial(self.preprocess, supervision=supervision))
-        if compress:
-            res = res.map(batch_compress_dict())
         return res
 
     def collate(
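
In the lines shown, `preprocess_many` no longer maps `batch_compress_dict` over the stream even though `compress=True` remains in its signature. If downstream code still needs the compressed form, the removed step can presumably be re-applied on the returned stream. A sketch, assuming `batch_compress_dict` keeps its import path in `edsnlp.utils.collections`:

    from edsnlp.utils.collections import batch_compress_dict  # assumed import path

    # preprocess_many no longer applies compression in the lines shown;
    # re-apply the removed mapping downstream if the compressed form is needed:
    stream = nlp.preprocess_many(docs, supervision=True)
    stream = stream.map(batch_compress_dict())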

edsnlp/core/stream.py

Lines changed: 11 additions & 8 deletions
@@ -34,7 +34,7 @@
 
     from edsnlp import Pipeline
     from edsnlp.core.torch_component import TorchComponent
-    from edsnlp.data.base import BaseReader, BaseWriter
+    from edsnlp.data.base import BaseReader, BaseWriter, BatchWriter
 
 
 def deep_isgeneratorfunction(x):
@@ -234,6 +234,9 @@ def __init__(self, prepare_batch, forward, postprocess, elementwise=False):
         self.postprocess = postprocess
         self.elementwise = elementwise
 
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
     def batch_process(self, batch):
         res = self.forward(self.prepare_batch(batch, None))
         return self.postprocess(batch, res) if self.postprocess is not None else res
@@ -275,7 +278,7 @@ class Stream(metaclass=MetaStream):
     def __init__(
         self,
         reader: Optional[BaseReader] = None,
-        writer: Optional[BaseWriter] = None,
+        writer: Optional[Union[BaseWriter, BatchWriter]] = None,
        ops: List[Any] = [],
        config: Dict = {},
    ):
@@ -446,10 +449,10 @@ def set_processing(
             List of GPU devices to use for the CPU workers. Used for debugging purposes.
         deterministic: bool
             Whether to try and preserve the order of the documents in "multiprocessing"
-            mode. If set to False, workers will process documents whenever they are
-            available in a dynamic fashion, which may result in out-of-order processing.
-            If set to true, tasks will be distributed in a static, round-robin fashion
-            to workers. Defaults to True.
+            mode. If set to `False`, workers will process documents whenever they are
+            available in a dynamic fashion, which may result in out-of-order but usually
+            faster processing. If set to true, tasks will be distributed in a
+            static, round-robin fashion to workers. Defaults to `True`.
 
         Returns
         -------
@@ -461,8 +464,8 @@ def set_processing(
             or kwargs.pop("sort_chunks", INFER) is not INFER
         ):
             warnings.warn(
-                """chunk_size and sort_chunks are deprecated, use \
-map_batched(sort_fn, batch_size=chunk_size) instead.""",
+                "chunk_size and sort_chunks are deprecated, use "
+                "map_batched(sort_fn, batch_size=chunk_size) instead.",
                 VisibleDeprecationWarning,
             )
         if kwargs.pop("split_into_batches_after", INFER) is not INFER:
