
Commit 125d90f

fix: support sentinel based write_in_worker in mp backend

1 parent d85b5c1

File tree

11 files changed: +236 −106 lines

changelog.md

Lines changed: 4 additions & 0 deletions
@@ -36,6 +36,10 @@
 - Readers now have a `loop` parameter to cycle over the data indefinitely (useful for training)
 - Readers now have a `shuffle` parameter to shuffle the data before iterating over it
 - In `multiprocessing` mode, file-based readers now read the data in the workers (this was previously an option)
+- We now support two new special batch sizes:
+    - "fragment" in the case of parquet datasets: the rows of one full parquet file fragment per batch
+    - "dataset", which is mostly useful during training, for instance to shuffle the dataset at each epoch
+  These are also compatible with batched writers such as parquet, where each input fragment can be processed and mapped to a single matching output fragment.
 - :boom: Breaking change: a `map` function returning a list or a generator won't be automatically flattened anymore. Use `flatten()` to flatten the output if needed. This shouldn't change the behavior for most users, since most writers (to_pandas, to_polars, to_parquet, ...) still flatten the output
 - :boom: Breaking change: `chunk_size` and `sort_chunks` are now deprecated: to sort data before applying a transformation, use `.map_batches(custom_sort_fn, batch_size=...)`
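To make the new special batch sizes concrete, here is a minimal usage sketch. It assumes the `read_parquet`/`write_parquet` signatures shown further down in this commit; the paths and the "omop" converter are placeholders, not part of the change:

import edsnlp

# Hedged sketch: paths and converter are placeholders.
stream = edsnlp.data.read_parquet("input/notes/", converter="omop")

# batch_size="fragment": each batch holds the rows of one input parquet
# fragment, so each input fragment can be mapped to one matching output
# fragment by the writer.
edsnlp.data.write_parquet(
    stream,
    "output/notes/",
    batch_size="fragment",
    overwrite=True,
)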

edsnlp/core/stream.py

Lines changed: 25 additions & 18 deletions
@@ -88,10 +88,6 @@ class Op(abc.ABC):
     def __call__(self, items):
         raise NotImplementedError()

-    @property
-    def expected_sentinels(self):
-        return set()
-

 class FlattenOp(Op):
     elementwise = False
@@ -150,10 +146,6 @@ def __repr__(self):
             f"sentinel_mode={self.sentinel_mode})"
         )

-    @property
-    def expected_sentinels(self):
-        return getattr(self.batch_fn, "expected_sentinels", set())
-

 class MapOp(Op):
     def __init__(self, pipe, kwargs):
@@ -966,7 +958,13 @@ def _make_stages(self, split_torch_pipes: bool) -> List[Stage]:

     def validate_ops(self, ops, update: bool = False):
         # Check batchify requirements
-        expected_sentinels = set()
+        requires_sentinels = set()
+
+        if hasattr(self.writer, "batch_fn") and hasattr(
+            self.writer.batch_fn, "requires_sentinel"
+        ):
+            requires_sentinels.add(self.writer.batch_fn.requires_sentinel)
+
         self_batch_fn = batchify_fns.get(self.batch_by, self.batch_by)
         for op in reversed(ops):
             if isinstance(op, BatchifyOp):
@@ -977,29 +975,38 @@
                     else None
                 )
                 if sentinel_mode == "auto":
-                    sentinel_mode = "split" if expected_sentinels else "drop"
-                if expected_sentinels and op.sentinel_mode == "drop":
+                    sentinel_mode = "split" if requires_sentinels else "drop"
+                if requires_sentinels and op.sentinel_mode == "drop":
                     raise ValueError(
                         f"Operation {op} drops the stream sentinel values "
                         f"(markers for the end of a dataset or a dataset "
                         f"fragment), but some downstream operation(s) require "
-                        f"the following sentinel values: {expected_sentinels}. "
+                        f"the following sentinel values: {requires_sentinels}. "
                         f"Ensure that you do not set `sentinel_mode='drop'` on "
                         f"any upstream batching operation."
                     )
-                expected_sentinels.update(op.expected_sentinels)
                 if update:
                     op.sentinel_mode = sentinel_mode

-        if expected_sentinels and (self.backend == "spark" or not self.deterministic):
+                if hasattr(batch_fn, "requires_sentinel"):
+                    requires_sentinels.add(batch_fn.requires_sentinel)
+
+        sentinel_str = ", ".join(requires_sentinels)
+        if requires_sentinels and self.backend == "spark":
             raise ValueError(
-                f"Some operations require sentinel values ({expected_sentinels}), "
+                f"Some operations require sentinel values ({sentinel_str}), "
                 f"but the Spark backend does not support sentinel values."
             )
-        if not (expected_sentinels < self.reader.emitted_sentinels):
+        if requires_sentinels and not self.deterministic:
+            raise ValueError(
+                f"Some operations require sentinel values ({sentinel_str}), "
+                f"but these are not supported when `deterministic=False`."
+            )
+        if not (requires_sentinels <= self.reader.emitted_sentinels):
             raise ValueError(
-                f"Some operations require sentinel values ({expected_sentinels}), "
-                f"but the reader does not emit these values."
+                f"Some operations require sentinel values ({sentinel_str}), "
+                f"but the reader does not emit these values "
+                f"({', '.join(self.reader.emitted_sentinels)})."
             )

     def __repr__(self):
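For context on the `requires_sentinel` attribute that `validate_ops` now collects (from the writer's `batch_fn` and from each `BatchifyOp`'s batching function) and checks against the reader's `emitted_sentinels`: a batching function declares the kind of sentinel it needs to see in the stream. Below is a hypothetical sketch of such a function, not taken from this commit; the real implementations live in `edsnlp.utils.batching` and their exact signatures may differ:

from edsnlp.utils.stream_sentinels import FragmentEndSentinel

def batchify_by_fragment(iterable, batch_size=None, sentinel_mode="split"):
    # Close the current batch at each fragment-end marker.
    batch = []
    for item in iterable:
        if isinstance(item, FragmentEndSentinel):
            if batch:
                yield batch
                batch = []
            if sentinel_mode == "split":
                yield item  # forward the sentinel so downstream ops stay aligned
            # with sentinel_mode="drop", the sentinel is only a batch boundary
        else:
            batch.append(item)
    if batch:
        yield batch

# The attribute inspected by validate_ops and by the executors: the kind of
# sentinel this batching function needs the reader to emit.
batchify_by_fragment.requires_sentinel = "fragment"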

edsnlp/data/base.py

Lines changed: 2 additions & 2 deletions
@@ -90,8 +90,8 @@ def consolidate(self, items: Iterable):

 class BatchWriter(BaseWriter):
     batch_size: Optional[int] = None
-    batch_by: Callable
-    batch_in_worker: bool = False
+    batch_fn: Callable
+    write_in_worker: bool = False

     def handle_batch(self, batch):
         raise NotImplementedError()
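The renamed interface reads as follows: `batch_fn` (formerly `batch_by`) groups records into batches, `write_in_worker` (formerly `batch_in_worker`) decides whether batches are formed and written in the workers or in the main process, and `handle_batch` returns a `(result, count)` pair, as the `writer.handle_batch(b)[0]` calls in the executors below imply. A toy subclass for illustration only (hypothetical, not part of this commit):

from edsnlp.data.base import BatchWriter
from edsnlp.utils.collections import batchify

class CountingWriter(BatchWriter):
    # Toy writer that only counts records per batch.

    def __init__(self, batch_size=1024, write_in_worker=False):
        self.batch_size = batch_size
        self.batch_fn = batchify  # plain size-based batching
        self.write_in_worker = write_in_worker

    def handle_record(self, record):
        return record  # no per-record conversion in this toy example

    def handle_batch(self, batch):
        # Return the processed batch and the number of records it held,
        # matching the (result, count) contract used by the executors.
        return batch, len(batch)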

edsnlp/data/parquet.py

Lines changed: 34 additions & 35 deletions
@@ -14,7 +14,7 @@
 from edsnlp.core.stream import Stream
 from edsnlp.data.base import BatchWriter, FileBasedReader
 from edsnlp.data.converters import get_dict2doc_converter, get_doc2dict_converter
-from edsnlp.utils.batching import batchify_fns
+from edsnlp.utils.batching import BatchBy, batchify_fns
 from edsnlp.utils.collections import batchify, dl_to_ld, flatten, ld_to_dl, shuffle
 from edsnlp.utils.file_system import FileSystem, normalize_fs_path
 from edsnlp.utils.stream_sentinels import DatasetEndSentinel, FragmentEndSentinel
@@ -61,12 +61,16 @@ def read_records(self) -> Iterable[Any]:
                 for file in shuffle(files, self.rng):
                     records = shuffle(self.read_fragment(file), self.rng)
                     yield from records
-                    yield FragmentEndSentinel(file)
-            else:
+                    yield FragmentEndSentinel(file.path)
+            elif self.shuffle == "dataset":
                 records = (line for file in files for line in self.read_fragment(file))
-                if self.shuffle == "dataset":
-                    records = shuffle(records, self.rng)
+                records = shuffle(records, self.rng)
                 yield from records
+            else:
+                for file in files:
+                    records = list(self.read_fragment(file))
+                    yield from records
+                    yield FragmentEndSentinel(file.path)
             yield DatasetEndSentinel()
             if not self.loop:
                 break
@@ -85,9 +89,9 @@ def __init__(
         self,
         *,
         path: Union[str, Path],
-        batch_size: Optional[Union[int]] = None,
-        batch_by: Union[Callable, Literal["docs"]] = "docs",
-        batch_in_worker: bool = False,
+        batch_size: Optional[Union[int, str]] = None,
+        batch_by: BatchBy = None,
+        write_in_worker: bool = False,
         overwrite: bool,
         filesystem: Optional[FileSystem] = None,
     ):
@@ -113,21 +117,18 @@
             for file in dataset.files:
                 self.fs.rm_file(file)
         self.fs = filesystem
-        assert batch_by is None or batch_by in batchify_fns or callable(batch_by)
-        self.batch_by = batchify_fns.get(batch_by, batch_by)
-        if (
-            batch_by in ("docs", "doc")
-            or self.batch_by is batchify
-            and batch_size is None
-        ):
+        batch_size, batch_by = Stream.validate_batching(batch_size, batch_by)
+        if batch_by in ("docs", "doc", None, batchify) and batch_size is None:
             warnings.warn(
                 "You should specify a batch size when using record-wise batch writing. "
                 "Setting batch size to 1024."
             )
             batch_size = 1024
+        batch_by = batch_by or "docs"
+        self.batch_fn = batchify_fns.get(batch_by, batch_by)

         self.batch_size = batch_size
-        self.batch_in_worker = batch_in_worker
+        self.write_in_worker = write_in_worker
         self.batch = []
         self.closed = False

@@ -250,9 +251,9 @@ def write_parquet(
     data: Union[Any, Stream],
     path: Union[str, Path],
     *,
-    batch_size: Optional[int] = None,
-    batch_by: Union[Callable, Literal["docs"]] = "docs",
-    batch_in_worker: bool = True,
+    batch_size: Optional[Union[int, str]] = None,
+    batch_by: BatchBy = None,
+    write_in_worker: bool = True,
     overwrite: bool = False,
     filesystem: Optional[FileSystem] = None,
     execute: bool = True,
@@ -295,15 +296,17 @@
         The method to batch the documents. If "docs", the batch size is the number of
         documents. If "fragment", each batch corresponds to a parquet file fragment from
         the input data.
-    batch_in_worker: bool
-        In multiprocessing or spark mode, whether to batch the documents in the workers
-        or in the main process.
+    write_in_worker: bool
+        In multiprocessing or spark mode, whether to batch and write the documents in
+        the workers or in the main process.

         For instance, a worker may read the 1st, 3rd, 5th, ... documents, while another
-        reads the 2nd, 4th, 6th, ... documents. If `batch_in_worker` is False and
-        `deterministic` is True (default), the original order of the documents will be
-        recovered in the main process, and batching there can produce fragments that
-        respect the original order.
+        reads the 2nd, 4th, 6th, ... documents.
+
+        If `write_in_worker` is False, `deterministic` is True (default), and no
+        operation adds or removes documents from the stream (e.g., no `map_batches`),
+        the original order of the documents will be recovered in the main process,
+        and batching there can produce fragments that respect the original order.
     overwrite: bool
         Whether to overwrite existing directories.
     filesystem: Optional[AbstractFileSystem] = None,
@@ -326,14 +329,10 @@ def write_parquet(
             batch_size is None
         ), "Cannot specify both 'batch_size' and deprecated 'num_rows_per_file'."
         batch_size = kwargs.pop("num_rows_per_file")
-        assert batch_by == "docs", "Cannot use 'num_rows_per_file' with 'batch_by'."
-    if "write_in_worker" in kwargs:
-        warnings.warn(
-            "The 'write_in_worker' parameter is deprecated. To perform "
-            "batching in the worker processes, set 'batch_in_worker=True'.",
-            VisibleDeprecationWarning,
-        )
-        batch_in_worker = kwargs.pop("write_in_worker")
+        assert batch_by in (
+            None,
+            "docs",
+        ), "Cannot use 'num_rows_per_file' with 'batch_by'."
     if "accumulate" in kwargs:
         warnings.warn(
             "The 'accumulate' parameter is deprecated.", VisibleDeprecationWarning
@@ -347,7 +346,7 @@
             path=path,
             batch_size=batch_size,
             batch_by=batch_by,
-            batch_in_worker=batch_in_worker,
+            write_in_worker=write_in_worker,
             overwrite=overwrite,
             filesystem=filesystem,
         ),
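To illustrate the trade-off documented in the docstring above, a hedged usage sketch (paths are placeholders; `set_processing` is the usual way to select a backend in edsnlp):

import edsnlp

stream = edsnlp.data.read_parquet("input/notes/", converter="omop")
stream = stream.set_processing(backend="multiprocessing", num_cpu_workers=4)

# write_in_worker=False: batches are formed in the main process after the
# original document order has been restored (deterministic=True by default),
# so each output fragment can mirror one input fragment.
edsnlp.data.write_parquet(
    stream,
    "output/notes/",
    batch_size="fragment",
    write_in_worker=False,
    overwrite=True,
)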

edsnlp/processing/multiprocessing.py

Lines changed: 43 additions & 10 deletions
@@ -705,16 +705,27 @@ def preprocess_before_forward(self, items, stage):
     def send_results(self, items):
         writer = self.stream.writer
         if writer is not None:
-            items = (writer.handle_record(rec) for rec in items)
-            if getattr(writer, "batch_in_worker", None) is True:
-                items = writer.batch_by(
+            items = (
+                writer.handle_record(rec)
+                if not isinstance(rec, StreamSentinel)
+                else rec
+                for rec in items
+            )
+            if getattr(writer, "write_in_worker", None) is True:
+                items = writer.batch_fn(
                     items,
                     batch_size=writer.batch_size,
                     sentinel_mode="drop",
                 )
-                items = (writer.handle_batch(b) for b in items)
+                items = (
+                    writer.handle_batch(b)
+                    for b in items
+                    if not isinstance(b, StreamSentinel)
+                )
             else:
-                items = ((x, 1) for x in items if not isinstance(x, StreamSentinel))
+                items = (
+                    (x, 1) if not isinstance(x, StreamSentinel) else (x, 0) for x in items
+                )

         name = f"from-{self.uid}_to-main"
         queue = self.data_queues[name]
@@ -1024,11 +1035,15 @@ def run(self):
         # Create the main iterator
         items = self.dequeue_outputs()
         writer = self.stream.writer
-        if getattr(writer, "batch_in_worker", None) is False:
+        if getattr(writer, "write_in_worker", None) is False:
             writer: BatchWriter
-            items = writer.batch_by(items, writer.batch_size)
+            items = writer.batch_fn(items, writer.batch_size, sentinel_mode="drop")
             # get the 1st element (2nd is the count)
-            items = (writer.handle_batch(b)[0] for b in items)
+            items = (
+                writer.handle_batch(b)[0]
+                for b in items
+                if not isinstance(b, StreamSentinel)
+            )

         # If we are garbage collected, stop the execution
         weakref.finalize(items, self.teardown, garbage_collected=True)
@@ -1060,6 +1075,13 @@

     def iter_outputs(self, stop_mode=False):
         deterministic = self.stream.deterministic
+        requires_sentinel = (
+            hasattr(self.stream.writer, "batch_fn")
+            and getattr(self.stream.writer.batch_fn, "requires_sentinel", None)
+            and not self.stream.writer.write_in_worker
+        )
+        missing_sentinels = len(self.cpu_worker_names) if requires_sentinel else 0
+        buffer = []
         while self.num_alive_workers > 0:
             if self.stopped and not stop_mode:  # pragma: no cover
                 raise StopSignal()
@@ -1097,9 +1119,20 @@
                 self.num_alive_workers -= 1
                 self.workers_status[worker_idx] = False
                 continue
-            if isinstance(out, StreamSentinel) and worker_idx > 0:
+            if isinstance(out[0], StreamSentinel):
+                if out[0].kind == requires_sentinel:
+                    missing_sentinels -= 1
+                    if missing_sentinels == 0:
+                        yield from buffer
+                        yield out
+                        buffer.clear()
+                        missing_sentinels = len(self.cpu_worker_names)
                 continue
-            yield out
+            if requires_sentinel:
+                buffer.append(out)
+            else:
+                yield out
+        yield from buffer
         if self.error:
             raise self.error
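The buffering added to `iter_outputs` holds worker outputs back until every CPU worker has emitted the sentinel kind required by the writer's `batch_fn`, so a fragment is only flushed once it is complete across all workers. A standalone toy sketch of the idea (the real code polls per-worker queues and carries `(item, count)` tuples; `StreamSentinel` is assumed importable from `edsnlp.utils.stream_sentinels`):

from edsnlp.utils.stream_sentinels import StreamSentinel

def round_robin(iterators):
    # Simplified stand-in for polling the worker output queues in turn.
    iterators = [iter(it) for it in iterators]
    while iterators:
        for it in list(iterators):
            try:
                yield next(it)
            except StopIteration:
                iterators.remove(it)

def merge_outputs(worker_outputs, n_workers, kind="fragment"):
    missing, buffer = n_workers, []
    for out in round_robin(worker_outputs):
        if isinstance(out, StreamSentinel):
            if out.kind == kind:
                missing -= 1
                if missing == 0:  # every worker finished this fragment
                    yield from buffer
                    yield out
                    buffer.clear()
                    missing = n_workers
            continue
        buffer.append(out)
    yield from buffer  # flush whatever remains once all workers stop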

edsnlp/processing/simple.py

Lines changed: 5 additions & 4 deletions
@@ -80,12 +80,13 @@ def process():
                 for item in items
             )

-            if getattr(writer, "batch_by", None) is not None:
-                items = writer.batch_by(items, writer.batch_size, sentinel_mode="drop")
+            if getattr(writer, "batch_fn", None) is not None:
+                items = writer.batch_fn(items, writer.batch_size, sentinel_mode="drop")
                 # get the 1st element (2nd is the count)
                 for b in items:
-                    item, count = writer.handle_batch(b)
-                    bar.update(count)
+                    if not isinstance(b, StreamSentinel):
+                        item, count = writer.handle_batch(b)
+                        bar.update(count)
                     yield item
             else:
                 for item in items:

edsnlp/processing/spark.py

Lines changed: 4 additions & 4 deletions
@@ -132,8 +132,8 @@ def process_partition(items):  # pragma: no cover
         items = (writer.handle_record(item) for item in items)

         results = []
-        if getattr(writer, "batch_in_worker", None) is True:
-            items = writer.batch_by(items, writer.batch_size)
+        if getattr(writer, "write_in_worker", None) is True:
+            items = writer.batch_fn(items, writer.batch_size)
             # get the 1st element (2nd is the count)
             for item in items:
                 item, count = writer.handle_batch(item)
@@ -163,9 +163,9 @@
         for item in df.rdd.mapPartitions(process_partition).toLocalIterator()
     )

-    if getattr(writer, "batch_in_worker", None) is False:
+    if getattr(writer, "write_in_worker", None) is False:
         writer: BatchWriter
-        items = writer.batch_by(items, writer.batch_size)
+        items = writer.batch_fn(items, writer.batch_size)
         # get the 1st element (2nd is the count)
         items = (writer.handle_batch(b)[0] for b in items)
     return items if writer is None else writer.consolidate(items)
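For completeness, on Spark the writer batches inside each partition when `write_in_worker=True` (the default for `write_parquet`), and `validate_ops` rejects sentinel-requiring batch functions on this backend, so `batch_size` should remain an integer there. A hedged sketch (paths and converter are placeholders):

import edsnlp

stream = edsnlp.data.read_parquet("input/notes/", converter="omop")
stream = stream.set_processing(backend="spark")

# Each Spark partition batches and writes its own documents; special batch
# sizes like "fragment" rely on sentinels, which this backend does not emit.
edsnlp.data.write_parquet(
    stream,
    "output/notes/",
    batch_size=1024,
    overwrite=True,
)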
