Commit 989b70c

refactor: deprecate batch_size argument of Pipeline
1 parent 7c58309 commit 989b70c

File tree

2 files changed: +8 -11 lines changed

changelog.md

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@
 - `eds.span_context_getter`'s parameter `context_sents` is no longer optional and must be explicitly set to 0 to disable sentence context
 - In multi-GPU setups, streams that contain torch components are now stripped of their parameter tensors when sent to CPU Workers since these workers only perform preprocessing and postprocessing and should therefore not need the model parameters.
+- The `batch_size` argument of `Pipeline` is deprecated and is not used anymore. Use the `batch_size` argument of `stream.map_pipeline` instead.

 ### Fixed

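For readers migrating, here is a minimal usage sketch of the replacement API (not part of the commit; the example texts and the blank "eds" pipeline are illustrative assumptions):

import edsnlp

nlp = edsnlp.blank("eds")  # illustrative blank pipeline; add components as needed
texts = ["Patient admitted for chest pain.", "No relevant history."]  # made-up inputs

# Before this commit: the batch size could be set on the Pipeline itself and was
# read by nlp.pipe(); passing it now only triggers a DeprecationWarning.
# nlp = edsnlp.blank("eds", batch_size=32)
# docs = list(nlp.pipe(texts))

# After this commit: set the batch size on the stream, via map_pipeline
stream = edsnlp.data.from_iterable(texts)
stream = stream.map_pipeline(nlp, batch_size=32)
docs = list(stream)
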
edsnlp/core/pipeline.py

Lines changed: 7 additions & 11 deletions
@@ -101,7 +101,7 @@ def __init__(
         lang: str,
         create_tokenizer: Optional[Callable[[Self], Tokenizer]] = None,
         vocab: Union[bool, Vocab] = True,
-        batch_size: Optional[int] = 128,
+        batch_size: Optional[int] = None,
         vocab_config: Type[BaseDefaults] = None,
         meta: Dict[str, Any] = None,
         pipeline: Optional[Sequence[str]] = None,
@@ -119,8 +119,6 @@ def __init__(
             Function that creates a tokenizer for the pipeline
         vocab: Union[bool, Vocab]
             Whether to create a new vocab or use an existing one
-        batch_size: Optional[int]
-            Batch size to use in the `.pipe()` method
         vocab_config: Type[BaseDefaults]
             Configuration for the vocab
         meta: Dict[str, Any]
@@ -129,6 +127,12 @@ def __init__(
         spacy_blank_cls = get_lang_class(lang)

         self.Defaults = spacy_blank_cls.Defaults
+        if batch_size is not None:
+            warnings.warn(
+                "The 'batch_size' argument is deprecated. Use the 'batch_size' "
+                "argument in `stream.map_pipeline` instead.",
+                DeprecationWarning,
+            )
         self.batch_size = batch_size
         if (vocab is not True) and (vocab_config is not None):
             raise ValueError(
@@ -397,7 +401,6 @@ def __call__(self, text: Union[str, Doc]) -> Doc:
     def pipe(
         self,
         inputs: Union[Iterable, Stream],
-        batch_size: Optional[int] = None,
         n_process: int = None,
         **kwargs,
     ) -> Stream:
@@ -409,9 +412,6 @@ def pipe(
         ----------
         inputs: Iterable[Union[str, Doc]]
             The inputs to create the Docs from, or Docs directly.
-        batch_size: Optional[int]
-            The batch size to use. If not provided, the batch size of the pipeline
-            object will be used.
         n_process: int
             Deprecated. Use the ".set(num_cpu_workers=n_process)" method on the returned
             data stream instead.
@@ -423,10 +423,6 @@ def pipe(
         Stream
         """

-        if batch_size is None:
-            batch_size = self.batch_size
-        kwargs = {"batch_size": batch_size, **kwargs}
-
         stream = edsnlp.data.from_iterable(inputs)
         stream = stream.map_pipeline(self, **kwargs)
         if n_process is not None:
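
The new guard in __init__ can be exercised as follows (a sketch, not from the commit; it assumes the "eds" language is registered and that Pipeline is importable from edsnlp.core.pipeline, the module shown in the diff header):

import warnings

from edsnlp.core.pipeline import Pipeline

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    nlp = Pipeline(lang="eds", batch_size=64)  # deprecated argument, kept only to emit the warning

# A DeprecationWarning is emitted, and pipe() no longer reads self.batch_size;
# batching is now controlled by stream.map_pipeline(..., batch_size=...).
assert any(issubclass(w.category, DeprecationWarning) for w in caught)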
